
BlueBench Leaderboard
BlueBench is an open-source benchmark developed by domain experts to represent required needs of Enterprise users. It is constructed using state-of-the-art benchmarking methodologies to ensure validity, robustness, and efficiency by utilizing unitxt's abilities for dynamic and flexible text processing. As a dynamic and evolving benchmark, BlueBench currently encompasses diverse domains such as legal, finance, customer support, and news. It also evaluates a range of capabilities, including RAG, pro-social behavior, summarization, and chatbot performance, with additional tasks and domains to be integrated over time.
{
- "headers": [
- "Model",
- "Average",
- "Bias",
- "Chatbot Abilities",
- "Entity Extraction",
- "Knowledge",
- "Legal",
- "News Classification",
- "Product Help",
- "QA Finance",
- "RAG General",
- "Reasoning",
- "Safety",
- "Summarization",
- "Translation"
- "data": [
- [
- "openai/gpt-4.1",
- 64.61,
- 97.98,
- 96.36,
- 72.39,
- 58.16,
- 60.47,
- 66.29,
- 82.33,
- 33,
- 48.46,
- 77.5,
- 87.49,
- 18.25,
- 41.22
- [
- "openai/gpt-4o",
- 64.13,
- 93.94,
- 94.14,
- 73.62,
- 45.92,
- 60.87,
- 69.07,
- 85.95,
- 37,
- 48.9,
- 78,
- 87.84,
- 17.06,
- 41.42
- [
- "openai/gpt-4.1-mini",
- 63.01,
- 94.95,
- 97.55,
- 73.17,
- 42.86,
- 59.32,
- 66.67,
- 84.95,
- 28,
- 54.58,
- 73,
- 85.96,
- 17.92,
- 40.16
- [
- "meta-llama/llama-3-405b-instruct",
- 61.6,
- 100,
- 43.21,
- 79.75,
- 66.33,
- 64.91,
- 66.3,
- 82.59,
- 27,
- 53.17,
- 75,
- 81.35,
- 19.07,
- 42.14
- [
- "mistralai/mistral-medium-2505",
- 60.5,
- 93.94,
- 94.17,
- 74.21,
- 44.9,
- 67.17,
- 53.76,
- 79.65,
- 33,
- 52.57,
- 75.5,
- 84.68,
- 17.43,
- 15.52
- [
- "meta-llama/llama-3-3-70b-instruct",
- 58.96,
- 95.96,
- 49.69,
- 70.18,
- 53.06,
- 59.54,
- 59.89,
- 86.64,
- 24,
- 49.02,
- 73.5,
- 83.8,
- 19.47,
- 41.71
- [
- "openai/o1",
- 58.6,
- 94.95,
- 34.38,
- 79.22,
- 71.43,
- 51.22,
- 68.16,
- 83,
- 22,
- 41.24,
- 79,
- 82.96,
- 16.77,
- 37.44
- [
- "openai/gpt-4.1-nano",
- 56.98,
- 81.82,
- 95.57,
- 61.11,
- 44.9,
- 56.91,
- 42.16,
- 68.52,
- 26,
- 50.88,
- 69,
- 86.67,
- 16.94,
- 40.29
- [
- "mistralai/mistral-large",
- 55.49,
- 96.97,
- 86.13,
- 61.18,
- 55.1,
- 34.71,
- 29.01,
- 67.02,
- 19,
- 51.08,
- 75,
- 88.04,
- 18.24,
- 39.83
- [
- "mistralai/mistral-small-3-1-24b-instruct-2503",
- 54.11,
- 93.94,
- 88.04,
- 38.22,
- 55.1,
- 39.1,
- 56.1,
- 72.02,
- 25,
- 45.72,
- 73,
- 87.62,
- 16.06,
- 13.55
- [
- "openai/o4-mini",
- 53.98,
- 94.95,
- 11.39,
- 75.47,
- 40.82,
- 56.4,
- 67.37,
- 84.27,
- 22,
- 40.32,
- 75.5,
- 83.55,
- 16.3,
- 33.43
- [
- "openai/o3-mini",
- 53.85,
- 97.98,
- 37.11,
- 72.86,
- 50,
- 37.87,
- 43.06,
- 86.08,
- 23,
- 44.06,
- 73,
- 80.61,
- 16.66,
- 37.79
- [
- "ibm/granite-3-8b-instruct",
- 49.69,
- 83.84,
- 51.32,
- 30.3,
- 43.88,
- 49.73,
- 50,
- 75.04,
- 16,
- 53.97,
- 65.5,
- 88.03,
- 18.02,
- 20.38
- [
- "ibm/granite-3-3-8b-instruct",
- 49.29,
- 76.77,
- 60.78,
- 44.27,
- 32.65,
- 55.94,
- 46.07,
- 73.88,
- 9,
- 51.92,
- 62,
- 88.44,
- 17.15,
- 21.86
- [
- "mistralai/pixtral-12b",
- 45.38,
- 72.73,
- 76.54,
- 22,
- 38.78,
- 32.9,
- 21.54,
- 59.87,
- 12,
- 47.85,
- 73.5,
- 82.45,
- 17.28,
- 32.54
- [
- "meta-llama/llama-3-2-3b-instruct",
- 44.27,
- 60.61,
- 25.14,
- 45.26,
- 35.71,
- 50.56,
- 36.47,
- 75.58,
- 10,
- 47.81,
- 56,
- 83.37,
- 17.99,
- 31.01
- [
- "ibm/granite-3-2-b-instruct",
- 40.99,
- 46.46,
- 30.99,
- 26.58,
- 29.59,
- 49.21,
- 35.8,
- 70.72,
- 13,
- 46.34,
- 58.5,
- 84.87,
- 17.23,
- 23.53
- [
- "meta-llama/llama-3-2-1b-instruct",
- 31.83,
- 48.48,
- 5.7,
- 20.69,
- 19.39,
- 37.26,
- 20,
- 45.62,
- 7,
- 44.7,
- 41,
- 83,
- 17.62,
- 23.29
- [
- "metadata": null