deep-reinforce committed on
Commit
ffd781a
·
verified ·
1 Parent(s): ca78039

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +59 -40
README.md CHANGED
@@ -32,59 +32,78 @@ This model card documents **Ornith-1.0-397B**, the lightweight member of the Orn
32
  <div style="font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',Roboto,sans-serif;width:100%;margin:0 auto;padding:16px 0">
33
  <table style="width:100%;table-layout:fixed;border-collapse:collapse;font-size:13px">
34
  <thead><tr>
35
- <th style="width:28%;padding:10px 7px;text-align:left;font-weight:600;border-bottom:2px solid #FD8E5B;color:#FD8E5B"></th><th style="width:14.4%;padding:10px 7px;text-align:center;font-weight:700;border-bottom:2px solid #FD8E5B;color:#FD8E5B;font-size:14px;background:rgba(253, 142, 91, 0.12)">Ornith-1.0-397B-A3B</th><th style="width:14.4%;padding:10px 7px;text-align:center;font-weight:500;border-bottom:2px solid #FD8E5B;color:#FD8E5B;font-size:14px;">Qwen3.5-35B-A3B</th><th style="width:14.4%;padding:10px 7px;text-align:center;font-weight:500;border-bottom:2px solid #FD8E5B;color:#FD8E5B;font-size:14px;">Qwen3.6-35B-A3B</th><th style="width:14.4%;padding:10px 7px;text-align:center;font-weight:500;border-bottom:2px solid #FD8E5B;color:#FD8E5B;font-size:14px;">Gemma4-31B</th><th style="width:14.4%;padding:10px 7px;text-align:center;font-weight:500;border-bottom:2px solid #FD8E5B;color:#FD8E5B;font-size:14px;border-left:2px solid rgba(253, 142, 91, 0.55);">Qwen3.5-397B</th></tr></thead>
 
 
 
 
 
 
 
 
36
  <tbody>
37
- <tr><td colspan="5" style="padding:8px 12px;font-weight:600;color:#FD8E5B;border-bottom:1px solid rgba(253, 142, 91, 0.2);background:rgba(253, 142, 91, 0.1)">Agentic Coding</td><td colspan="1" style="border-bottom:1px solid rgba(253, 142, 91, 0.2);background:rgba(253, 142, 91, 0.1);border-left:2px solid rgba(253, 142, 91, 0.55);"></td></tr>
38
  <tr>
39
- <td style="padding:7px 7px;padding-left:20px;border-bottom:1px solid rgba(128, 128, 128, 0.15);">Terminal-Bench 2.1 <sub><small>(Terminus-2)</small></sub></td>
40
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);font-weight:600;color:#FD8E5B;background:rgba(253, 142, 91, 0.06)">64.2</td>
41
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">41.4</td>
42
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">52.5</td>
43
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">42.1</td>
44
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);border-left:2px solid rgba(253, 142, 91, 0.55);">53.5</td>
 
 
45
  </tr>
46
  <tr>
47
- <td style="padding:7px 7px;padding-left:20px;border-bottom:1px solid rgba(128, 128, 128, 0.15);">SWE-bench Verified</td>
48
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);font-weight:600;color:#FD8E5B;background:rgba(253, 142, 91, 0.06)">75.6</td>
49
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">70.0</td>
50
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">73.4</td>
51
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">52.0</td>
52
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);border-left:2px solid rgba(253, 142, 91, 0.55);">76.4</td>
 
 
53
  </tr>
54
  <tr>
55
- <td style="padding:7px 7px;padding-left:20px;border-bottom:1px solid rgba(128, 128, 128, 0.15);">SWE-bench Pro</td>
56
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);font-weight:600;color:#FD8E5B;background:rgba(253, 142, 91, 0.06)">50.4</td>
57
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">44.6</td>
58
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">49.5</td>
59
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">35.7</td>
60
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);border-left:2px solid rgba(253, 142, 91, 0.55);">51.6</td>
 
 
61
  </tr>
62
  <tr>
63
- <td style="padding:7px 7px;padding-left:20px;border-bottom:1px solid rgba(128, 128, 128, 0.15);">SWE-bench Multilingual</td>
64
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);font-weight:600;color:#FD8E5B;background:rgba(253, 142, 91, 0.06)">69.3</td>
65
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">60.3</td>
66
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">67.2</td>
67
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">51.7</td>
68
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);border-left:2px solid rgba(253, 142, 91, 0.55);">69.3</td>
 
 
69
  </tr>
70
  <tr>
71
- <td style="padding:7px 7px;padding-left:20px;border-bottom:1px solid rgba(128, 128, 128, 0.15);">NL2Repo</td>
72
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);font-weight:600;color:#FD8E5B;background:rgba(253, 142, 91, 0.06)">34.6</td>
73
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">20.5</td>
74
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">29.4</td>
75
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">15.5</td>
76
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);border-left:2px solid rgba(253, 142, 91, 0.55);">36.8</td>
 
 
77
  </tr>
78
  <tr>
79
- <td style="padding:7px 7px;padding-left:20px;border-bottom:1px solid rgba(128, 128, 128, 0.15);">Claw-eval Avg</td>
80
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);font-weight:600;color:#FD8E5B;background:rgba(253, 142, 91, 0.06)">69.8</td>
81
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">65.4</td>
82
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">68.7</td>
83
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">48.5</td>
84
- <td style="padding:7px 7px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);border-left:2px solid rgba(253, 142, 91, 0.55);">70.7</td>
 
 
85
  </tr>
86
- </tbody>
87
- </table>
88
  <p style="margin-top:12px;font-size:10px;opacity:0.7">
89
  * Terminal-Bench 2.1: Harbor/Terminus-2, 3h timeout, 32 CPU / 48GB RAM, avg of 5 runs.<br/>
90
  * All SWE-bench: Mini-SWE-Agent, temp=1.0, top_p=0.95, 200K context window.<br/>
 
32
  <div style="font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',Roboto,sans-serif;width:100%;margin:0 auto;padding:16px 0">
33
  <table style="width:100%;table-layout:fixed;border-collapse:collapse;font-size:13px">
34
  <thead><tr>
35
+ <th style="width:24%;padding:10px 6px;text-align:left;font-weight:600;border-bottom:2px solid #FD8E5B;color:#FD8E5B"></th>
36
+ <th style="width:10.85%;padding:10px 5px;text-align:center;font-weight:700;border-bottom:2px solid #FD8E5B;color:#FD8E5B;font-size:13px;background:rgba(253, 142, 91, 0.12)">Ornith-1.0-397B</th>
37
+ <th style="width:10.85%;padding:10px 5px;text-align:center;font-weight:500;border-bottom:2px solid #FD8E5B;color:#FD8E5B;font-size:13px;">Qwen3.5-397B</th>
38
+ <th style="width:10.85%;padding:10px 5px;text-align:center;font-weight:500;border-bottom:2px solid #FD8E5B;color:#FD8E5B;font-size:13px;">Qwen3.7-Max</th>
39
+ <th style="width:10.85%;padding:10px 5px;text-align:center;font-weight:500;border-bottom:2px solid #FD8E5B;color:#FD8E5B;font-size:13px;">Minimax M3</th>
40
+ <th style="width:10.85%;padding:10px 5px;text-align:center;font-weight:500;border-bottom:2px solid #FD8E5B;color:#FD8E5B;font-size:13px;">DeepSeek-V4-Pro</th>
41
+ <th style="width:10.85%;padding:10px 5px;text-align:center;font-weight:500;border-bottom:2px solid #FD8E5B;color:#FD8E5B;font-size:13px;">Claude Opus 4.7</th>
42
+ <th style="width:10.85%;padding:10px 5px;text-align:center;font-weight:500;border-bottom:2px solid #FD8E5B;color:#FD8E5B;font-size:13px;">Claude Opus 4.8</th>
43
+ </tr></thead>
44
  <tbody>
45
+ <tr><td colspan="8" style="padding:8px 12px;font-weight:600;color:#FD8E5B;border-bottom:1px solid rgba(253, 142, 91, 0.2);background:rgba(253, 142, 91, 0.1)">Agentic Coding</td></tr>
46
  <tr>
47
+ <td style="padding:7px 5px;padding-left:20px;border-bottom:1px solid rgba(128, 128, 128, 0.15);">Terminal-Bench 2.1 <sub><small>(Terminus-2)</small></sub></td>
48
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);font-weight:600;color:#FD8E5B;background:rgba(253, 142, 91, 0.06)">77.5</td>
49
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">53.5</td>
50
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">73.5</td>
51
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">64</td>
52
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">64</td>
53
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">70.3</td>
54
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">85</td>
55
  </tr>
56
  <tr>
57
+ <td style="padding:7px 5px;padding-left:20px;border-bottom:1px solid rgba(128, 128, 128, 0.15);">SWE-bench Verified</td>
58
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);font-weight:600;color:#FD8E5B;background:rgba(253, 142, 91, 0.06)">82.4</td>
59
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">76.4</td>
60
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">80.4</td>
61
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">-</td>
62
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">80.6</td>
63
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">80.8</td>
64
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">87.6</td>
65
  </tr>
66
  <tr>
67
+ <td style="padding:7px 5px;padding-left:20px;border-bottom:1px solid rgba(128, 128, 128, 0.15);">SWE-bench Pro</td>
68
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);font-weight:600;color:#FD8E5B;background:rgba(253, 142, 91, 0.06)">62.2</td>
69
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">51.6</td>
70
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">60.6</td>
71
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">59</td>
72
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">55.4</td>
73
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">64.3</td>
74
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">69.2</td>
75
  </tr>
76
  <tr>
77
+ <td style="padding:7px 5px;padding-left:20px;border-bottom:1px solid rgba(128, 128, 128, 0.15);">SWE-bench Multilingual</td>
78
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);font-weight:600;color:#FD8E5B;background:rgba(253, 142, 91, 0.06)">78.9</td>
79
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">69.3</td>
80
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">78.3</td>
81
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">-</td>
82
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">76.2</td>
83
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">-</td>
84
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">-</td>
85
  </tr>
86
  <tr>
87
+ <td style="padding:7px 5px;padding-left:20px;border-bottom:1px solid rgba(128, 128, 128, 0.15);">NL2Repo</td>
88
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);font-weight:600;color:#FD8E5B;background:rgba(253, 142, 91, 0.06)">48.2</td>
89
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">36.8</td>
90
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">47.2</td>
91
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">42.1</td>
92
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">-</td>
93
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">-</td>
94
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">69.7</td>
95
  </tr>
96
  <tr>
97
+ <td style="padding:7px 5px;padding-left:20px;border-bottom:1px solid rgba(128, 128, 128, 0.15);">Claw-eval Avg</td>
98
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15);font-weight:600;color:#FD8E5B;background:rgba(253, 142, 91, 0.06)">77.1</td>
99
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">70.7</td>
100
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">65.2</td>
101
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">-</td>
102
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">75.8</td>
103
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">78.2</td>
104
+ <td style="padding:7px 5px;text-align:center;border-bottom:1px solid rgba(128, 128, 128, 0.15)">-</td>
105
  </tr>
106
+ </tbody></table>
 
107
  <p style="margin-top:12px;font-size:10px;opacity:0.7">
108
  * Terminal-Bench 2.1: Harbor/Terminus-2, 3h timeout, 32 CPU / 48GB RAM, avg of 5 runs.<br/>
109
  * All SWE-bench: Mini-SWE-Agent, temp=1.0, top_p=0.95, 200K context window.<br/>