README.md (+76 −79: 76 additions, 79 deletions)
@@ -28,14 +28,16 @@ Originally developed within the [AIenhancedWork](https://github.com/LSeu-Open/AI
 - **Community score** (20 points max)
 - **Technical specifications** (20 points max)
 
-The final score is calculated out of 100 points (if you want a detailed breakdown of the scoring framework, please refer to the [scoring_framework.md](https://github.com/LSeu-Open/AIEnhancedWork/blob/main/Scoring/scoring_framework.md) file).
+The final score is calculated out of 100 points (if you want a detailed breakdown of the scoring framework, please refer to the [scoring_framework_development_notes.md](https://github.com/LSeu-Open/AIEnhancedWork/blob/main/Scoring/dev_ideas/scoring_framework_development_notes.md) file).
 
 Please note that this is a beta version and the scoring system is subject to change.
 
 To help us refine and improve LLMScoreEngine during this beta phase, we actively encourage user feedback, bug reports, and contributions. Please feel free to [open an issue](https://github.com/LSeu-Open/LLMScoreEngine/issues) or [contribute](CONTRIBUTING.md) to the project. Make sure to respect the [Code of Conduct](CODE_OF_CONDUCT.md).
 
-> [!NOTE]
-> Following the v0.5.0 update, which introduced modular configuration capabilities, we are now developing a dedicated version of this evaluation system tailored for Vision Language Models (VLMs). This development is ongoing in its [dedicated Repository](https://github.com/LSeu-Open/VLMScoreEngine).
+## New Features in Beta v0.6
+
+- Updated the technical score calculation to take the model's input and output prices into account (see the sketch below).
+- Added a new CLI option to generate a graph report from the existing CSV report.
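The first bullet does not spell out how the two prices are combined, so here is a rough, hypothetical Python sketch of one way a blended per-token cost could be derived before it feeds the technical score. The 1:3 input-to-output weighting and the function name are assumptions for illustration only, not the formula LLMScoreEngine actually uses (see `models_scoring.py` and the scoring framework notes for the real calculation).

```python
# Hypothetical illustration only -- not the actual LLMScoreEngine formula.
# Assumption: prices are in USD per million tokens and are blended with a
# 1:3 input:output weighting before contributing to the technical score.

def blended_price(input_price: float, output_price: float,
                  input_weight: float = 0.25, output_weight: float = 0.75) -> float:
    """Collapse separate input/output prices into one comparable cost figure."""
    return input_weight * input_price + output_weight * output_price

# With the example prices from the Results sample further down (0.55 / 2.19):
print(round(blended_price(0.55, 2.19), 2))  # 1.78
```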
 
 ## Project Structure
 
@@ -53,6 +55,8 @@ LLMScoreEngine/
 │   │   └── models_scoring.py
 │   ├── utils/                   # Utility functions
 │   │   ├── config_loader.py
+│   │   ├── csv_reporter.py
+│   │   ├── graph_reporter.py
 │   │   └── logging.py
 │   ├── __init__.py
 │   └── run_scoring.py           # Script for running scoring programmatically
@@ -67,7 +71,9 @@ LLMScoreEngine/
 │   │   ├── test_hf_score.py
 │   │   └── test_models_scoring.py
 │   ├── utils/
-│   │   └── test_config_loader.py
+│   │   ├── test_config_loader.py
+│   │   ├── test_csv_reporter.py
+│   │   └── test_graph_reporter.py
 │   ├── __init__.py
 │   └── test_run_scoring.py
 ├── LICENSE                      # Project license file
@@ -203,7 +209,8 @@ Models data should be stored as JSON files in the `Models` directory, with the f
     "hf_score": null
   },
   "model_specs": {
-    "price": null,
+    "input_price": null,
+    "output_price": null,
     "context_window": null,
     "param_count": null,
     "architecture": null
@@ -293,6 +300,7 @@ You can customize the scoring process with the following optional flags:
 |`--quiet`| Suppress all informational output and only print the final scores in the console. Useful for scripting. |`python score_models.py --all --quiet`|
 |`--config <path>`| Path to a custom Python configuration file to override the default scoring parameters. |`python score_models.py ModelName --config my_config.py`|
 |`--csv`| Generate a CSV report from existing results. |`python score_models.py --csv`|
+|`--graph`| Generate a graph report from the existing CSV report. |`python score_models.py --graph`|
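For scripting, the documented flags can be chained into one workflow: score everything, write the CSV report, then render the graph report. The sketch below only strings together commands already shown in the table above; driving them from Python via `subprocess` is an illustrative choice, not a requirement.

```python
# Illustrative workflow built only from flags documented in the table above.
import subprocess

for args in (["--all", "--quiet"], ["--csv"], ["--graph"]):
    subprocess.run(["python", "score_models.py", *args], check=True)
```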
 
 ### IDE Usage
 
@@ -304,87 +312,76 @@ Results will be stored as JSON files in the `Results` directory, with the follow
 
 ```json
 {
-  "model_name": "Deepseek-R1",
-  "scores": {
-    "entity_score": 18.84257142857143,
-    "dev_score": 23.063999999999997,
-    "external_score": 41.906571428571425,
-    "community_score": 16.76,
-    "technical_score": 16.95878387917363,
-    "final_score": 75.63,
-    "avg_performance": 73.21368421052631
+  "entity_benchmarks": {
+    "artificial_analysis": 60.22,
+    "OpenCompass": 86.7,
+    "LLM Explorer": 59.0,
+    "Livebench": 72.49,
+    "open_llm": null,
+    "UGI Leaderboard": 55.65,
+    "big_code_bench": 35.1,
+    "EvalPlus Leaderboard": null,
+    "Dubesord_LLM": 70.5,
+    "Open VLM": null
   },
-  "input_data": {
-    "entity_benchmarks": {
-      "artificial_analysis": 0.6022,
-      "OpenCompass": 0.867,
-      "LLM Explorer": 0.59,
-      "Livebench": 0.7249,
-      "open_llm": null,
-      "UGI Leaderboard": 0.5565,
-      "big_code_bench": 0.35100000000000003,
-      "EvalPlus Leaderboard": null,
-      "Dubesord_LLM": 0.705,
-      "Open VLM": null
-    },
-    "dev_benchmarks": {
-      "MMLU": 0.9079999999999999,
-      "MMLU Pro": 0.84,
-      "BigBenchHard": null,
-      "GPQA diamond": 0.715,
-      "DROP": 0.922,
-      "HellaSwag": null,
-      "Humanity's Last Exam": null,
-      "ARC-C": null,
-      "Wild Bench": null,
-      "MT-bench": null,
-      "IFEval": 0.833,
-      "Arena-Hard": 0.9229999999999999,
-      "MATH": 0.973,
-      "GSM-8K": null,
-      "AIME": 0.7979999999999999,
-      "HumanEval": null,
-      "MBPP": null,
-      "LiveCodeBench": 0.659,
-      "Aider Polyglot": 0.5329999999999999,
-      "SWE-Bench": 0.49200000000000005,
-      "SciCode": null,
-      "MGSM": null,
-      "MMMLU": null,
-      "C-Eval or CMMLU": 0.9179999999999999,
-      "AraMMLu": null,
-      "LongBench": null,
-      "RULER 128K": null,
-      "RULER 32K": null,
-      "MTOB": null,
-      "BFCL": null,
-      "AgentBench": null,
-      "Gorilla Benchmark": null,
-      "ToolBench": null,
-      "MINT": null,
-      "MMMU": null,
-      "Mathvista": null,
-      "ChartQA": null,
-      "DocVQA": null,
-      "AI2D": null
-    },
-    "community_score": {
-      "lm_sys_arena_score": 1363,
-      "hf_score": 9.5
-    },
-    "model_specs": {
-      "price": 0.55,
-      "context_window": 128000,
-      "param_count": 685,
-      "architecture": "moe"
-    }
+  "dev_benchmarks": {
+    "MMLU": 90.8,
+    "MMLU Pro": 84.0,
+    "BigBenchHard": null,
+    "GPQA diamond": 71.5,
+    "DROP": 92.2,
+    "HellaSwag": null,
+    "Humanity's Last Exam": null,
+    "ARC-C": null,
+    "Wild Bench": null,
+    "MT-bench": null,
+    "IFEval": 83.3,
+    "Arena-Hard": 92.3,
+    "MATH": 97.3,
+    "GSM-8K": null,
+    "AIME": 79.8,
+    "HumanEval": null,
+    "MBPP": null,
+    "LiveCodeBench": 65.9,
+    "Aider Polyglot": 53.3,
+    "SWE-Bench": 49.2,
+    "SciCode": null,
+    "MGSM": null,
+    "MMMLU": null,
+    "C-Eval or CMMLU": 91.8,
+    "AraMMLu": null,
+    "LongBench": null,
+    "RULER 128K": null,
+    "RULER 32K": null,
+    "MTOB": null,
+    "BFCL": null,
+    "AgentBench": null,
+    "Gorilla Benchmark": null,
+    "ToolBench": null,
+    "MINT": null,
+    "MMMU": null,
+    "Mathvista": null,
+    "ChartQA": null,
+    "DocVQA": null,
+    "AI2D": null
+  },
+  "community_score": {
+    "lm_sys_arena_score": 1389,
+    "hf_score": 8.79
+  },
+  "model_specs": {
+    "input_price": 0.55,
+    "output_price": 2.19,
+    "context_window": 128000,
+    "param_count": 685,
+    "architecture": "moe"
   }
 }
 ```
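To show how the flattened result layout above can be consumed, here is a small sketch that reads one result file and averages the non-null `dev_benchmarks` entries. The file name and the plain mean are illustrative assumptions; LLMScoreEngine's own reporting lives in `csv_reporter.py` and `graph_reporter.py`.

```python
# Illustrative sketch only; the result file name is an assumption and the
# simple mean below is not the library's own scoring logic.
import json
from pathlib import Path

def average_dev_benchmarks(result_path: Path) -> float | None:
    """Average the dev_benchmarks values that are not null in a result JSON."""
    result = json.loads(result_path.read_text(encoding="utf-8"))
    values = [v for v in result["dev_benchmarks"].values() if v is not None]
    return sum(values) / len(values) if values else None

# Example usage (assumes a scored model exists in the Results directory):
# print(average_dev_benchmarks(Path("Results/Deepseek-R1.json")))
```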
 
 ## License
 
-This project is licensed under the MIT License - see the [LICENSE.md](https://github.com/LSeu-Open/LLMScoreEngine/blob/main/LICENSE) file for details.
+This project is licensed under the MIT License - see the [LICENSE-CODE.md](https://github.com/LSeu-Open/AIEnhancedWork/blob/main/LICENSE-CODE.md) file for details.