LLM-Based Automated Literature Review Evaluation Benchmark

This leaderboard evaluates Large Language Models (LLMs) on their ability to perform automated literature review tasks, including reference generation, abstract writing, and review composition.
It is based on the study: Large Language Models for Automated Literature Review: An Evaluation of Reference Generation, Abstract Writing, and Review Composition.
The leaderboard measures how well different models generate verifiable references and produce factually consistent, stylistically appropriate academic text.
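
The Task 1 (T1) columns score the references each model generates, for example how often a generated reference's title can actually be found. The exact verification pipeline is defined in the cited study; the sketch below is only an illustration of a "Title Search Rate"-style check, where `search_titles` is a hypothetical lookup against a scholarly search backend and the 0.9 fuzzy-match threshold is an assumption, not the paper's setting.

```python
from difflib import SequenceMatcher
from typing import Callable, List

def normalize(title: str) -> str:
    """Lowercase and collapse whitespace so near-identical titles compare equal."""
    return " ".join(title.lower().split())

def title_found(generated: str, candidates: List[str], threshold: float = 0.9) -> bool:
    """A generated reference counts as 'found' if some retrieved title is near-identical."""
    g = normalize(generated)
    return any(SequenceMatcher(None, g, normalize(c)).ratio() >= threshold for c in candidates)

def title_search_rate(generated_titles: List[str],
                      search_titles: Callable[[str], List[str]]) -> float:
    """Share (in %) of generated reference titles that the search backend can locate.

    `search_titles(title) -> list of candidate titles` is a hypothetical helper
    standing in for a real bibliographic search API; it is not the study's code.
    """
    if not generated_titles:
        return 0.0
    hits = sum(title_found(t, search_titles(t)) for t in generated_titles)
    return 100.0 * hits / len(generated_titles)
```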

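The Task 2 (T2, abstract writing) and Task 3 (T3, review composition) columns combine surface-overlap scores (ROUGE-1/2/L), an embedding-similarity score, and entailment-based consistency checks. The study's exact toolchain is not reproduced here; below is a minimal sketch assuming the widely used `rouge_score` and `sentence-transformers` packages, with `all-MiniLM-L6-v2` chosen purely for illustration.

```python
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util

def text_metrics(reference: str, generated: str) -> dict:
    """ROUGE F1 and embedding cosine similarity between a reference text and a generated text, in %."""
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    rouge = scorer.score(reference, generated)  # signature: score(target, prediction)

    # Illustrative embedding model; the study's similarity backend may differ.
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    ref_emb, gen_emb = embedder.encode([reference, generated], convert_to_tensor=True)
    similarity = util.cos_sim(ref_emb, gen_emb).item()

    return {
        "ROUGE-1 (%)": 100 * rouge["rouge1"].fmeasure,
        "ROUGE-2 (%)": 100 * rouge["rouge2"].fmeasure,
        "ROUGE-L (%)": 100 * rouge["rougeL"].fmeasure,
        "Similarity (%)": 100 * similarity,
    }
```

The Entail (TRUE %) and Entail (GPT-4o %) columns point to entailment-style factual-consistency judgments by two different judges (an NLI-style model and GPT-4o); those judges are not sketched here.
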
Task 1 (T1): Reference Generation

| Model | Average ⬆️ | Title Search Rate (%) | Precision (%) | Overlap (%) | Precision (First Author) (%) | Overlap (First Author) (%) |
|---|---|---|---|---|---|---|
| [Anthropic/Claude-3.5-Sonnet](https://huggingface.co/Anthropic/Claude-3.5-Sonnet) | 45.57 | 64.82 | 51.59 | 24.34 | 55.77 | 25.21 |
| [deepseek-ai/DeepSeek-V3](https://huggingface.co/deepseek-ai/DeepSeek-V3) | 43.41 | 56.04 | 46.33 | 19.72 | 50.66 | 20.5 |
| [OpenAI/GPT-4o](https://huggingface.co/OpenAI/GPT-4o) | 38.6 | 32.07 | 21.65 | 18.76 | 24.65 | 19.5 |
| [Qwen/Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | 31.72 | 21.8 | 12.25 | 12.6 | 17.58 | 13.27 |
| [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 25.73 | 16.62 | 3.45 | 8.48 | 6.95 | 8.67 |

Task 2 (T2): Abstract Writing

| Model | Similarity (%) | Entail (TRUE %) | Entail (GPT-4o %) | ROUGE-1 (%) | ROUGE-2 (%) | ROUGE-L (%) |
|---|---|---|---|---|---|---|
| Anthropic/Claude-3.5-Sonnet | 81.17 | 78.9 | 96.77 | 41.13 | 8.99 | 20 |
| deepseek-ai/DeepSeek-V3 | 80.96 | 78.55 | 96.84 | 41.13 | 8.98 | 20.33 |
| OpenAI/GPT-4o | 80.96 | 77.91 | 96.5 | 40.7 | 8.56 | 19.86 |
| Qwen/Qwen2.5-72B-Instruct | 80.22 | 69.52 | 95.02 | 40.61 | 8.78 | 20.12 |
| meta-llama/Llama-3.2-3B-Instruct | 79.28 | 62.39 | 92.14 | 40.35 | 8.96 | 20.52 |

Task 3 (T3): Review Composition

| Model | Precision (%) | Title Search Rate (%) | Overlap (%) | KPR (%) | ROUGE-1 (%) | ROUGE-2 (%) | ROUGE-L (%) |
|---|---|---|---|---|---|---|---|
| Anthropic/Claude-3.5-Sonnet | 59.06 | 66.43 | 31.9 | 62.32 | 28.59 | 8.9 | 14.41 |
| deepseek-ai/DeepSeek-V3 | 52.81 | 62.29 | 26.79 | 56.02 | 35.65 | 10.4 | 17.46 |
| OpenAI/GPT-4o | 50.62 | 60.05 | 27.88 | 59.18 | 30.78 | 9.72 | 15.54 |
| Qwen/Qwen2.5-72B-Instruct | 28.91 | 40.02 | 17.36 | 38.82 | 29.95 | 9.01 | 15.14 |
| meta-llama/Llama-3.2-3B-Instruct | 4.96 | 21.78 | 8.28 | 15.46 | 29.07 | 28.07 | 7.77 |

Model metadata

| Model | T | Type | Architecture | Precision | Hub License | #Params (B) | Hub ❤️ | Available on the hub | Model sha |
|---|---|---|---|---|---|---|---|---|---|
| Anthropic/Claude-3.5-Sonnet | ? |  | ? | float16 | ? | 0 | 0 | false | main |
| deepseek-ai/DeepSeek-V3 | ? |  | DeepseekV3ForCausalLM | float16 | ? | 0 | 0 | true | main |
| OpenAI/GPT-4o | ? |  | ? | float16 | ? | 0 | 0 | false | main |
| Qwen/Qwen2.5-72B-Instruct | ? |  | Qwen2ForCausalLM | float16 | ? | 0 | 0 | true | main |
| meta-llama/Llama-3.2-3B-Instruct | ? |  | LlamaForCausalLM | float16 | ? | 0 | 0 | true | main |
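
Once cleaned up, the leaderboard data is easy to work with programmatically. A minimal sketch using only pandas, with the Average and Task 1 numbers transcribed from the tables above:

```python
import pandas as pd

# Leaderboard rows transcribed from the tables above (Average and Task 1 metrics only).
rows = [
    ("Anthropic/Claude-3.5-Sonnet",      45.57, 64.82, 51.59, 24.34),
    ("deepseek-ai/DeepSeek-V3",          43.41, 56.04, 46.33, 19.72),
    ("OpenAI/GPT-4o",                    38.60, 32.07, 21.65, 18.76),
    ("Qwen/Qwen2.5-72B-Instruct",        31.72, 21.80, 12.25, 12.60),
    ("meta-llama/Llama-3.2-3B-Instruct", 25.73, 16.62,  3.45,  8.48),
]
df = pd.DataFrame(
    rows,
    columns=["model", "average", "t1_title_search_rate", "t1_precision", "t1_overlap"],
)

# Rank models by the overall average score, highest first.
print(df.sort_values("average", ascending=False).to_string(index=False))
```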