
BlueBench Leaderboard
BlueBench is an open-source benchmark developed by domain experts to represent required needs of Enterprise users. It is constructed using state-of-the-art benchmarking methodologies to ensure validity, robustness, and efficiency by utilizing unitxt's abilities for dynamic and flexible text processing. As a dynamic and evolving benchmark, BlueBench currently encompasses diverse domains such as legal, finance, customer support, and news. It also evaluates a range of capabilities, including RAG, pro-social behavior, summarization, and chatbot performance, with additional tasks and domains to be integrated over time.
{
- "headers": [
- "Model",
- "Average",
- "Bias",
- "Chatbot Abilities",
- "Entity Extraction",
- "Knowledge",
- "Legal",
- "News Classification",
- "Product Help",
- "QA Finance",
- "RAG General",
- "Reasoning",
- "Safety",
- "Summarization",
- "Translation"
- "data": [
- [
- "openai/gpt-4.1",
- 64.61,
- 97.98,
- 96.36,
- 72.39,
- 58.16,
- 60.47,
- 66.29,
- 82.33,
- 33,
- 48.46,
- 77.5,
- 87.49,
- 18.25,
- 41.22
- [
- "openai/gpt-4o",
- 64.13,
- 93.94,
- 94.14,
- 73.62,
- 45.92,
- 60.87,
- 69.07,
- 85.95,
- 37,
- 48.9,
- 78,
- 87.84,
- 17.06,
- 41.42
- [
- "openai/gpt-4.1-mini",
- 63.01,
- 94.95,
- 97.55,
- 73.17,
- 42.86,
- 59.32,
- 66.67,
- 84.95,
- 28,
- 54.58,
- 73,
- 85.96,
- 17.92,
- 40.16
- [
- "meta-llama/llama-3-405b-instruct",
- 61.6,
- 100,
- 43.21,
- 79.75,
- 66.33,
- 64.91,
- 66.3,
- 82.59,
- 27,
- 53.17,
- 75,
- 81.35,
- 19.07,
- 42.14
- [
- "mistralai/mistral-medium-2505",
- 60.5,
- 93.94,
- 94.17,
- 74.21,
- 44.9,
- 67.17,
- 53.76,
- 79.65,
- 33,
- 52.57,
- 75.5,
- 84.68,
- 17.43,
- 15.52
- [
- "meta-llama/llama-3-3-70b-instruct",
- 58.96,
- 95.96,
- 49.69,
- 70.18,
- 53.06,
- 59.54,
- 59.89,
- 86.64,
- 24,
- 49.02,
- 73.5,
- 83.8,
- 19.47,
- 41.71
- [
- "openai/o1",
- 58.6,
- 94.95,
- 34.38,
- 79.22,
- 71.43,
- 51.22,
- 68.16,
- 83,
- 22,
- 41.24,
- 79,
- 82.96,
- 16.77,
- 37.44
- [
- "openai/gpt-4.1-nano",
- 56.98,
- 81.82,
- 95.57,
- 61.11,
- 44.9,
- 56.91,
- 42.16,
- 68.52,
- 26,
- 50.88,
- 69,
- 86.67,
- 16.94,
- 40.29
- [
- "mistralai/mistral-large",
- 55.49,
- 96.97,
- 86.13,
- 61.18,
- 55.1,
- 34.71,
- 29.01,
- 67.02,
- 19,
- 51.08,
- 75,
- 88.04,
- 18.24,
- 39.83
- [
- "mistralai/mistral-small-3-1-24b-instruct-2503",
- 54.11,
- 93.94,
- 88.04,
- 38.22,
- 55.1,
- 39.1,
- 56.1,
- 72.02,
- 25,
- 45.72,
- 73,
- 87.62,
- 16.06,
- 13.55
- [
- "openai/o4-mini",
- 53.98,
- 94.95,
- 11.39,
- 75.47,
- 40.82,
- 56.4,
- 67.37,
- 84.27,
- 22,
- 40.32,
- 75.5,
- 83.55,
- 16.3,
- 33.43
- [
- "openai/o3-mini",
- 53.85,
- 97.98,
- 37.11,
- 72.86,
- 50,
- 37.87,
- 43.06,
- 86.08,
- 23,
- 44.06,
- 73,
- 80.61,
- 16.66,
- 37.79
- [
- "ibm/granite-3-8b-instruct",
- 49.69,
- 83.84,
- 51.32,
- 30.3,
- 43.88,
- 49.73,
- 50,
- 75.04,
- 16,
- 53.97,
- 65.5,
- 88.03,
- 18.02,
- 20.38
- [
- "ibm/granite-3-3-8b-instruct",
- 49.29,
- 76.77,
- 60.78,
- 44.27,
- 32.65,
- 55.94,
- 46.07,
- 73.88,
- 9,
- 51.92,
- 62,
- 88.44,
- 17.15,
- 21.86
- [
- "mistralai/pixtral-12b",
- 45.38,
- 72.73,
- 76.54,
- 22,
- 38.78,
- 32.9,
- 21.54,
- 59.87,
- 12,
- 47.85,
- 73.5,
- 82.45,
- 17.28,
- 32.54
- [
- "meta-llama/llama-3-2-3b-instruct",
- 44.27,
- 60.61,
- 25.14,
- 45.26,
- 35.71,
- 50.56,
- 36.47,
- 75.58,
- 10,
- 47.81,
- 56,
- 83.37,
- 17.99,
- 31.01
- [
- "ibm/granite-3-2-b-instruct",
- 40.99,
- 46.46,
- 30.99,
- 26.58,
- 29.59,
- 49.21,
- 35.8,
- 70.72,
- 13,
- 46.34,
- 58.5,
- 84.87,
- 17.23,
- 23.53
- [
- "meta-llama/llama-3-2-1b-instruct",
- 31.83,
- 48.48,
- 5.7,
- 20.69,
- 19.39,
- 37.26,
- 20,
- 45.62,
- 7,
- 44.7,
- 41,
- 83,
- 17.62,
- 23.29
- [
- "metadata": null