
BlueBench Leaderboard
BlueBench is an open-source benchmark developed by domain experts to represent the needs of enterprise users. It is constructed using state-of-the-art benchmarking methodologies to ensure validity, robustness, and efficiency by utilizing unitxt's abilities for dynamic and flexible text processing. As a dynamic and evolving benchmark, BlueBench currently encompasses diverse domains such as legal, finance, customer support, and news. It also evaluates a range of capabilities, including RAG, pro-social behavior, summarization, and chatbot performance, with additional tasks and domains to be integrated over time.
{
- "headers": [
- "Model",
- "Average",
- "Bias",
- "Chatbot Abilities",
- "Entity Extraction",
- "Knowledge",
- "Legal",
- "News Classification",
- "Product Help",
- "QA Finance",
- "RAG General",
- "Reasoning",
- "Safety",
- "Summarization",
- "Translation"
- "data": [
- [
- "watsonx/meta-llama/llama-3-2-90b-vision-instruct",
- 63.69,
- 100,
- 87.12,
- 72.97,
- 55.1,
- 62.93,
- 67.01,
- 84.75,
- 26,
- 49.6,
- 75,
- 85.86,
- 19.16,
- 42.43
- [
- "watsonx/meta-llama/llama-3-405b-instruct",
- 61.6,
- 100,
- 43.21,
- 79.75,
- 66.33,
- 64.91,
- 66.3,
- 82.59,
- 27,
- 53.17,
- 75,
- 81.35,
- 19.07,
- 42.14
- [
- "watsonx/mistralai/mistral-medium-2505",
- 60.5,
- 93.94,
- 94.17,
- 74.21,
- 44.9,
- 67.17,
- 53.76,
- 79.65,
- 33,
- 52.57,
- 75.5,
- 84.68,
- 17.43,
- 15.52
- [
- "watsonx/meta-llama/llama-3-3-70b-instruct",
- 58.96,
- 95.96,
- 49.69,
- 70.18,
- 53.06,
- 59.54,
- 59.89,
- 86.64,
- 24,
- 49.02,
- 73.5,
- 83.8,
- 19.47,
- 41.71
- [
- "watsonx/mistralai/mistral-large",
- 55.49,
- 96.97,
- 86.13,
- 61.18,
- 55.1,
- 34.71,
- 29.01,
- 67.02,
- 19,
- 51.08,
- 75,
- 88.04,
- 18.24,
- 39.83
- [
- "watsonx/meta-llama/llama-3-2-11b-vision-instruct",
- 54.26,
- 79.8,
- 60.26,
- 60.87,
- 40.82,
- 61,
- 53.93,
- 76.98,
- 16,
- 49.57,
- 68.5,
- 84.18,
- 18.67,
- 34.85
- [
- "watsonx/mistralai/mistral-small-3-1-24b-instruct-2503",
- 54.11,
- 93.94,
- 88.04,
- 38.22,
- 55.1,
- 39.1,
- 56.1,
- 72.02,
- 25,
- 45.72,
- 73,
- 87.62,
- 16.06,
- 13.55
- [
- "watsonx/ibm/granite-3-8b-instruct",
- 49.69,
- 83.84,
- 51.32,
- 30.3,
- 43.88,
- 49.73,
- 50,
- 75.04,
- 16,
- 53.97,
- 65.5,
- 88.03,
- 18.02,
- 20.38
- [
- "watsonx/ibm/granite-3-3-8b-instruct",
- 49.29,
- 76.77,
- 60.78,
- 44.27,
- 32.65,
- 55.94,
- 46.07,
- 73.88,
- 9,
- 51.92,
- 62,
- 88.44,
- 17.15,
- 21.86
- [
- "watsonx/mistralai/pixtral-12b",
- 45.38,
- 72.73,
- 76.54,
- 22,
- 38.78,
- 32.9,
- 21.54,
- 59.87,
- 12,
- 47.85,
- 73.5,
- 82.45,
- 17.28,
- 32.54
- [
- "watsonx/meta-llama/llama-3-2-3b-instruct",
- 44.27,
- 60.61,
- 25.14,
- 45.26,
- 35.71,
- 50.56,
- 36.47,
- 75.58,
- 10,
- 47.81,
- 56,
- 83.37,
- 17.99,
- 31.01
- [
- "watsonx/ibm/granite-3-2b-instruct",
- 40.99,
- 46.46,
- 30.99,
- 26.58,
- 29.59,
- 49.21,
- 35.8,
- 70.72,
- 13,
- 46.34,
- 58.5,
- 84.87,
- 17.23,
- 23.53
- [
- "watsonx/meta-llama/llama-3-2-1b-instruct",
- 31.83,
- 48.48,
- 5.7,
- 20.69,
- 19.39,
- 37.26,
- 20,
- 45.62,
- 7,
- 44.7,
- 41,
- 83,
- 17.62,
- 23.29
- [
- "metadata": null