BlueBench Leaderboard

BlueBench is an open-source benchmark developed by domain experts to represent the needs of enterprise users. It is constructed using state-of-the-art benchmarking methodologies to ensure validity, robustness, and efficiency by utilizing unitxt's abilities for dynamic and flexible text processing. As a dynamic and evolving benchmark, BlueBench currently encompasses diverse domains such as legal, finance, customer support, and news. It also evaluates a range of capabilities, including RAG, pro-social behavior, summarization, and chatbot performance, with additional tasks and domains to be integrated over time.

{
  • "headers": [
    • "Model",
    • "Average",
    • "Bias",
    • "Chatbot Abilities",
    • "Entity Extraction",
    • "Knowledge",
    • "Legal",
    • "News Classification",
    • "Product Help",
    • "QA Finance",
    • "RAG General",
    • "Reasoning",
    • "Safety",
    • "Summarization",
    • "Translation"
    ],
  • "data": [
    • [
      • "watsonx/meta-llama/llama-3-2-90b-vision-instruct",
      • 63.69,
      • 100,
      • 87.12,
      • 72.97,
      • 55.1,
      • 62.93,
      • 67.01,
      • 84.75,
      • 26,
      • 49.6,
      • 75,
      • 85.86,
      • 19.16,
      • 42.43
      ],
    • [
      • "watsonx/meta-llama/llama-3-405b-instruct",
      • 61.6,
      • 100,
      • 43.21,
      • 79.75,
      • 66.33,
      • 64.91,
      • 66.3,
      • 82.59,
      • 27,
      • 53.17,
      • 75,
      • 81.35,
      • 19.07,
      • 42.14
      ],
    • [
      • "watsonx/mistralai/mistral-medium-2505",
      • 60.5,
      • 93.94,
      • 94.17,
      • 74.21,
      • 44.9,
      • 67.17,
      • 53.76,
      • 79.65,
      • 33,
      • 52.57,
      • 75.5,
      • 84.68,
      • 17.43,
      • 15.52
      ],
    • [
      • "watsonx/meta-llama/llama-3-3-70b-instruct",
      • 58.96,
      • 95.96,
      • 49.69,
      • 70.18,
      • 53.06,
      • 59.54,
      • 59.89,
      • 86.64,
      • 24,
      • 49.02,
      • 73.5,
      • 83.8,
      • 19.47,
      • 41.71
      ],
    • [
      • "watsonx/mistralai/mistral-large",
      • 55.49,
      • 96.97,
      • 86.13,
      • 61.18,
      • 55.1,
      • 34.71,
      • 29.01,
      • 67.02,
      • 19,
      • 51.08,
      • 75,
      • 88.04,
      • 18.24,
      • 39.83
      ],
    • [
      • "watsonx/meta-llama/llama-3-2-11b-vision-instruct",
      • 54.26,
      • 79.8,
      • 60.26,
      • 60.87,
      • 40.82,
      • 61,
      • 53.93,
      • 76.98,
      • 16,
      • 49.57,
      • 68.5,
      • 84.18,
      • 18.67,
      • 34.85
      ],
    • [
      • "watsonx/mistralai/mistral-small-3-1-24b-instruct-2503",
      • 54.11,
      • 93.94,
      • 88.04,
      • 38.22,
      • 55.1,
      • 39.1,
      • 56.1,
      • 72.02,
      • 25,
      • 45.72,
      • 73,
      • 87.62,
      • 16.06,
      • 13.55
      ],
    • [
      • "watsonx/ibm/granite-3-8b-instruct",
      • 49.69,
      • 83.84,
      • 51.32,
      • 30.3,
      • 43.88,
      • 49.73,
      • 50,
      • 75.04,
      • 16,
      • 53.97,
      • 65.5,
      • 88.03,
      • 18.02,
      • 20.38
      ],
    • [
      • "watsonx/ibm/granite-3-3-8b-instruct",
      • 49.29,
      • 76.77,
      • 60.78,
      • 44.27,
      • 32.65,
      • 55.94,
      • 46.07,
      • 73.88,
      • 9,
      • 51.92,
      • 62,
      • 88.44,
      • 17.15,
      • 21.86
      ],
    • [
      • "watsonx/mistralai/pixtral-12b",
      • 45.38,
      • 72.73,
      • 76.54,
      • 22,
      • 38.78,
      • 32.9,
      • 21.54,
      • 59.87,
      • 12,
      • 47.85,
      • 73.5,
      • 82.45,
      • 17.28,
      • 32.54
      ],
    • [
      • "watsonx/meta-llama/llama-3-2-3b-instruct",
      • 44.27,
      • 60.61,
      • 25.14,
      • 45.26,
      • 35.71,
      • 50.56,
      • 36.47,
      • 75.58,
      • 10,
      • 47.81,
      • 56,
      • 83.37,
      • 17.99,
      • 31.01
      ],
    • [
      • "watsonx/ibm/granite-3-2b-instruct",
      • 40.99,
      • 46.46,
      • 30.99,
      • 26.58,
      • 29.59,
      • 49.21,
      • 35.8,
      • 70.72,
      • 13,
      • 46.34,
      • 58.5,
      • 84.87,
      • 17.23,
      • 23.53
      ],
    • [
      • "watsonx/meta-llama/llama-3-2-1b-instruct",
      • 31.83,
      • 48.48,
      • 5.7,
      • 20.69,
      • 19.39,
      • 37.26,
      • 20,
      • 45.62,
      • 7,
      • 44.7,
      • 41,
      • 83,
      • 17.62,
      • 23.29
      ]
    ],
  • "metadata": null
}