Commit 90ef8eb

Update README.md
1 parent d10d966 commit 90ef8eb

File tree

1 file changed: +76 -79 lines changed


README.md

Lines changed: 76 additions & 79 deletions
@@ -28,14 +28,16 @@ Originally developed within the [AIenhancedWork](https://github.com/LSeu-Open/AI
- **Community score** (20 points max)
- **Technical specifications** (20 points max)

-The final score is calculated out of 100 points (if you want to have a detailed breakdown of the scoring framework, please refer to the [scoring_framework.md](https://github.com/LSeu-Open/AIEnhancedWork/blob/main/Scoring/scoring_framework.md) file).
+The final score is calculated out of 100 points (if you want to have a detailed breakdown of the scoring framework, please refer to the [scoring_framework_development_notes.md](https://github.com/LSeu-Open/AIEnhancedWork/blob/main/Scoring/dev_ideas/scoring_framework_development_notes.md) file).

Please note that this is a beta version and the scoring system is subject to change.

To help us refine and improve LLMScoreEngine during this beta phase, we actively encourage user feedback, bug reports, and contributions. Please feel free to [open an issue](https://github.com/LSeu-Open/LLMScoreEngine/issues) or [contribute](CONTRIBUTING.md) to the project. Make sure to respect the [Code of Conduct](CODE_OF_CONDUCT.md).

-> [!NOTE]
-> Following the v0.5.0 update, which introduced modular configuration capabilities, we are now developing a dedicated version of this evaluation system tailored for Vision Language Models (VLMs). This development is ongoing in its [dedicated Repository](https://github.com/LSeu-Open/VLMScoreEngine).
+## New features in Beta v0.6
+
+- Updated the technical score calculation to take the model's input and output prices into account.
+- Added a new CLI option to generate a graph report from the existing CSV report.

## Project Structure
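The pricing bullet above can be illustrated with a minimal sketch that blends the two new price fields into one effective per-million-token price. The 75/25 input-to-output split and the `blended_price` helper are assumptions for illustration only; the actual v0.6 weighting lives in the project's scoring configuration, which this diff does not show.

```python
# Illustrative sketch only -- not the LLMScoreEngine formula.
# Combines the new input/output prices into a single effective price.

def blended_price(input_price: float, output_price: float,
                  input_share: float = 0.75) -> float:
    """Weighted average price per million tokens.

    input_share is an assumed fraction of input tokens in a typical
    request; 0.75 is a made-up value for this example.
    """
    return input_share * input_price + (1.0 - input_share) * output_price

# Using the Deepseek-R1 prices that appear later in this diff:
print(blended_price(0.55, 2.19))  # 0.96
```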

@@ -53,6 +55,8 @@ LLMScoreEngine/
│   │   └── models_scoring.py
│   ├── utils/                 # Utility functions
│   │   ├── config_loader.py
+│   │   ├── csv_reporter.py
+│   │   ├── graph_reporter.py
│   │   └── logging.py
│   ├── __init__.py
│   └── run_scoring.py         # Script for running scoring programmatically
@@ -67,7 +71,9 @@ LLMScoreEngine/
│   │   ├── test_hf_score.py
│   │   └── test_models_scoring.py
│   ├── utils/
-│   │   └── test_config_loader.py
+│   │   ├── test_config_loader.py
+│   │   ├── test_csv_reporter.py
+│   │   └── test_graph_reporter.py
│   ├── __init__.py
│   └── test_run_scoring.py
├── LICENSE                    # Project license file
@@ -203,7 +209,8 @@ Models data should be stored as JSON files in the `Models` directory, with the f
    "hf_score": null
  },
  "model_specs": {
-    "price": null,
+    "input_price": null,
+    "output_price": null,
    "context_window": null,
    "param_count": null,
    "architecture": null
@@ -293,6 +300,7 @@ You can customize the scoring process with the following optional flags:
| `--quiet` | Suppress all informational output and only print the final scores in the console. Useful for scripting. | `python score_models.py --all --quiet` |
| `--config <path>` | Path to a custom Python configuration file to override the default scoring parameters. | `python score_models.py ModelName --config my_config.py` |
| `--csv` | Generate a CSV report from existing results. | `python score_models.py --csv` |
+| `--graph` | Generate a graph report from the existing CSV report. | `python score_models.py --graph` |

### IDE Usage
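Since `--quiet` is intended for scripting, here is a small sketch of driving the CLI from a wrapper script and chaining the CSV and graph reports; it uses only the flags documented in the table above, and the wrapper itself is not part of the repository.

```python
# Sketch: run the documented score_models.py commands in sequence.
import subprocess
import sys

def run(*args: str) -> None:
    # Reuse the interpreter running this wrapper.
    subprocess.run([sys.executable, "score_models.py", *args], check=True)

run("--all", "--quiet")  # score every model, printing only the final scores
run("--csv")             # build the CSV report from existing results
run("--graph")           # render the graph report from that CSV
```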

@@ -304,87 +312,76 @@ Results will be stored as JSON files in the `Results` directory, with the follow

```json
{
-  "model_name": "Deepseek-R1",
-  "scores": {
-    "entity_score": 18.84257142857143,
-    "dev_score": 23.063999999999997,
-    "external_score": 41.906571428571425,
-    "community_score": 16.76,
-    "technical_score": 16.95878387917363,
-    "final_score": 75.63,
-    "avg_performance": 73.21368421052631
+  "entity_benchmarks": {
+    "artificial_analysis": 60.22,
+    "OpenCompass": 86.7,
+    "LLM Explorer": 59.0,
+    "Livebench": 72.49,
+    "open_llm": null,
+    "UGI Leaderboard": 55.65,
+    "big_code_bench": 35.1,
+    "EvalPlus Leaderboard": null,
+    "Dubesord_LLM": 70.5,
+    "Open VLM": null
  },
-  "input_data": {
-    "entity_benchmarks": {
-      "artificial_analysis": 0.6022,
-      "OpenCompass": 0.867,
-      "LLM Explorer": 0.59,
-      "Livebench": 0.7249,
-      "open_llm": null,
-      "UGI Leaderboard": 0.5565,
-      "big_code_bench": 0.35100000000000003,
-      "EvalPlus Leaderboard": null,
-      "Dubesord_LLM": 0.705,
-      "Open VLM": null
-    },
-    "dev_benchmarks": {
-      "MMLU": 0.9079999999999999,
-      "MMLU Pro": 0.84,
-      "BigBenchHard": null,
-      "GPQA diamond": 0.715,
-      "DROP": 0.922,
-      "HellaSwag": null,
-      "Humanity's Last Exam": null,
-      "ARC-C": null,
-      "Wild Bench": null,
-      "MT-bench": null,
-      "IFEval": 0.833,
-      "Arena-Hard": 0.9229999999999999,
-      "MATH": 0.973,
-      "GSM-8K": null,
-      "AIME": 0.7979999999999999,
-      "HumanEval": null,
-      "MBPP": null,
-      "LiveCodeBench": 0.659,
-      "Aider Polyglot": 0.5329999999999999,
-      "SWE-Bench": 0.49200000000000005,
-      "SciCode": null,
-      "MGSM": null,
-      "MMMLU": null,
-      "C-Eval or CMMLU": 0.9179999999999999,
-      "AraMMLu": null,
-      "LongBench": null,
-      "RULER 128K": null,
-      "RULER 32K": null,
-      "MTOB": null,
-      "BFCL": null,
-      "AgentBench": null,
-      "Gorilla Benchmark": null,
-      "ToolBench": null,
-      "MINT": null,
-      "MMMU": null,
-      "Mathvista": null,
-      "ChartQA": null,
-      "DocVQA": null,
-      "AI2D": null
-    },
-    "community_score": {
-      "lm_sys_arena_score": 1363,
-      "hf_score": 9.5
-    },
-    "model_specs": {
-      "price": 0.55,
-      "context_window": 128000,
-      "param_count": 685,
-      "architecture": "moe"
-    }
+  "dev_benchmarks": {
+    "MMLU": 90.8,
+    "MMLU Pro": 84.0,
+    "BigBenchHard": null,
+    "GPQA diamond": 71.5,
+    "DROP": 92.2,
+    "HellaSwag": null,
+    "Humanity's Last Exam": null,
+    "ARC-C": null,
+    "Wild Bench": null,
+    "MT-bench": null,
+    "IFEval": 83.3,
+    "Arena-Hard": 92.3,
+    "MATH": 97.3,
+    "GSM-8K": null,
+    "AIME": 79.8,
+    "HumanEval": null,
+    "MBPP": null,
+    "LiveCodeBench": 65.9,
+    "Aider Polyglot": 53.3,
+    "SWE-Bench": 49.2,
+    "SciCode": null,
+    "MGSM": null,
+    "MMMLU": null,
+    "C-Eval or CMMLU": 91.8,
+    "AraMMLu": null,
+    "LongBench": null,
+    "RULER 128K": null,
+    "RULER 32K": null,
+    "MTOB": null,
+    "BFCL": null,
+    "AgentBench": null,
+    "Gorilla Benchmark": null,
+    "ToolBench": null,
+    "MINT": null,
+    "MMMU": null,
+    "Mathvista": null,
+    "ChartQA": null,
+    "DocVQA": null,
+    "AI2D": null
+  },
+  "community_score": {
+    "lm_sys_arena_score": 1389,
+    "hf_score": 8.79
+  },
+  "model_specs": {
+    "input_price": 0.55,
+    "output_price": 2.19,
+    "context_window": 128000,
+    "param_count": 685,
+    "architecture": "moe"
  }
}
```

## License

-This project is licensed under the MIT License - see the [LICENSE.md](https://github.com/LSeu-Open/LLMScoreEngine/blob/main/LICENSE) file for details.
+This project is licensed under the MIT License - see the [LICENSE-CODE.md](https://github.com/LSeu-Open/AIEnhancedWork/blob/main/LICENSE-CODE.md) file for details.

<br>

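As a small usage sketch for the new flat result layout above, the snippet below counts how many benchmarks are actually populated in a result file; the path is illustrative and the section names come from the example in this diff.

```python
# Sketch: summarise benchmark coverage from a v0.6-format result file.
import json
from pathlib import Path

result = json.loads(Path("Results/Deepseek-R1.json").read_text(encoding="utf-8"))  # illustrative path

for section in ("entity_benchmarks", "dev_benchmarks"):
    values = result[section]
    filled = sum(1 for score in values.values() if score is not None)
    print(f"{section}: {filled}/{len(values)} benchmarks populated")
```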