BlueBench Leaderboard

BlueBench is an open-source benchmark developed by domain experts to represent the needs of enterprise users. It is constructed using state-of-the-art benchmarking methodologies to ensure validity, robustness, and efficiency, leveraging unitxt’s capabilities for dynamic and flexible text processing. As a dynamic and evolving benchmark, BlueBench currently covers diverse domains such as legal, finance, customer support, and news. It also evaluates a range of capabilities, including RAG, pro-social behavior, summarization, and chatbot performance; additional tasks and domains will be integrated over time.

{

"headers": [
- "Model",
- "Average",
- "Bias",
- "Chatbot Abilities",
- "Entity Extraction",
- "Knowledge",
- "Legal",
- "News Classification",
- "Product Help",
- "QA Finance",
- "RAG General",
- "Reasoning",
- "Safety",
- "Summarization",
- "Translation"
],
"data": [
- [
  - "watsonx/meta-llama/llama-3-2-90b-vision-instruct",
  - 63.69,
  - 100,
  - 87.12,
  - 72.97,
  - 55.1,
  - 62.93,
  - 67.01,
  - 84.75,
  - 26,
  - 49.6,
  - 75,
  - 85.86,
  - 19.16,
  - 42.43
  ],
- [
  - "watsonx/meta-llama/llama-3-405b-instruct",
  - 61.6,
  - 100,
  - 43.21,
  - 79.75,
  - 66.33,
  - 64.91,
  - 66.3,
  - 82.59,
  - 27,
  - 53.17,
  - 75,
  - 81.35,
  - 19.07,
  - 42.14
  ],
- [
  - "watsonx/mistralai/mistral-medium-2505",
  - 60.5,
  - 93.94,
  - 94.17,
  - 74.21,
  - 44.9,
  - 67.17,
  - 53.76,
  - 79.65,
  - 33,
  - 52.57,
  - 75.5,
  - 84.68,
  - 17.43,
  - 15.52
  ],
- [
  - "watsonx/meta-llama/llama-3-3-70b-instruct",
  - 58.96,
  - 95.96,
  - 49.69,
  - 70.18,
  - 53.06,
  - 59.54,
  - 59.89,
  - 86.64,
  - 24,
  - 49.02,
  - 73.5,
  - 83.8,
  - 19.47,
  - 41.71
  ],
- [
  - "watsonx/mistralai/mistral-large",
  - 55.49,
  - 96.97,
  - 86.13,
  - 61.18,
  - 55.1,
  - 34.71,
  - 29.01,
  - 67.02,
  - 19,
  - 51.08,
  - 75,
  - 88.04,
  - 18.24,
  - 39.83
  ],
- [
  - "watsonx/meta-llama/llama-3-2-11b-vision-instruct",
  - 54.26,
  - 79.8,
  - 60.26,
  - 60.87,
  - 40.82,
  - 61,
  - 53.93,
  - 76.98,
  - 16,
  - 49.57,
  - 68.5,
  - 84.18,
  - 18.67,
  - 34.85
  ],
- [
  - "watsonx/mistralai/mistral-small-3-1-24b-instruct-2503",
  - 54.11,
  - 93.94,
  - 88.04,
  - 38.22,
  - 55.1,
  - 39.1,
  - 56.1,
  - 72.02,
  - 25,
  - 45.72,
  - 73,
  - 87.62,
  - 16.06,
  - 13.55
  ],
- [
  - "watsonx/ibm/granite-3-8b-instruct",
  - 49.69,
  - 83.84,
  - 51.32,
  - 30.3,
  - 43.88,
  - 49.73,
  - 50,
  - 75.04,
  - 16,
  - 53.97,
  - 65.5,
  - 88.03,
  - 18.02,
  - 20.38
  ],
- [
  - "watsonx/ibm/granite-3-3-8b-instruct",
  - 49.29,
  - 76.77,
  - 60.78,
  - 44.27,
  - 32.65,
  - 55.94,
  - 46.07,
  - 73.88,
  - 9,
  - 51.92,
  - 62,
  - 88.44,
  - 17.15,
  - 21.86
  ],
- [
  - "watsonx/mistralai/pixtral-12b",
  - 45.38,
  - 72.73,
  - 76.54,
  - 22,
  - 38.78,
  - 32.9,
  - 21.54,
  - 59.87,
  - 12,
  - 47.85,
  - 73.5,
  - 82.45,
  - 17.28,
  - 32.54
  ],
- [
  - "watsonx/meta-llama/llama-3-2-3b-instruct",
  - 44.27,
  - 60.61,
  - 25.14,
  - 45.26,
  - 35.71,
  - 50.56,
  - 36.47,
  - 75.58,
  - 10,
  - 47.81,
  - 56,
  - 83.37,
  - 17.99,
  - 31.01
  ],
- [
  - "watsonx/ibm/granite-3-2b-instruct",
  - 40.99,
  - 46.46,
  - 30.99,
  - 26.58,
  - 29.59,
  - 49.21,
  - 35.8,
  - 70.72,
  - 13,
  - 46.34,
  - 58.5,
  - 84.87,
  - 17.23,
  - 23.53
  ],
- [
  - "watsonx/meta-llama/llama-3-2-1b-instruct",
  - 31.83,
  - 48.48,
  - 5.7,
  - 20.69,
  - 19.39,
  - 37.26,
  - 20,
  - 45.62,
  - 7,
  - 44.7,
  - 41,
  - 83,
  - 17.62,
  - 23.29
  ]
],
"metadata": null

}