Jellyfish042 committed on
Commit
c0ca9ee
·
1 Parent(s): ffc40f8

update to latest models

Browse files
README.md CHANGED
@@ -29,11 +29,17 @@ license: mit
29
  ## 本地启动
30
 
31
  ```bash
32
- conda run -n torch2 python app.py
33
  ```
34
 
35
  ## 可选预置模型文件名
36
 
37
- - `models/rwkv7-g1d-0.1b-*.pth`
38
- - `models/rwkv7-g1d-0.4b-*.pth`
39
- - `models/rwkv7-g1d-1.5b-*.pth`
 
 
 
 
 
 
 
29
  ## 本地启动
30
 
31
  ```bash
32
+ cmd /c "\"D:\anaconda3\condabin\conda.bat\" activate torch2 && python app.py"
33
  ```
34
 
35
  ## 可选预置模型文件名
36
 
37
+ - `models/rwkv7-*-0.1b-*.pth`
38
+ - `models/rwkv7-*-0.4b-*.pth`
39
+ - `models/rwkv7-*-1.5b-*.pth`
40
+
41
+ 当前首选文件名:
42
+
43
+ - `models/rwkv7-g1d-0.1b-20260129-ctx8192.pth`
44
+ - `models/rwkv7-g1d-0.4b-20260210-ctx8192.pth`
45
+ - `models/rwkv7-g1f-1.5b-20260419-ctx8192.pth`
app.py CHANGED
@@ -26,7 +26,7 @@ REQUIRED_MODEL_SIZES = ["0.1b", "0.4b", "1.5b"] # TEMP: 2.9b disabled due to OO
26
  PREFERRED_MODEL_FILENAMES = {
27
  "0.1b": "rwkv7-g1d-0.1b-20260129-ctx8192.pth",
28
  "0.4b": "rwkv7-g1d-0.4b-20260210-ctx8192.pth",
29
- "1.5b": "rwkv7-g1d-1.5b-20260212-ctx8192.pth",
30
  # "2.9b": "rwkv7-g1d-2.9b-20260131-ctx8192.pth", # TEMP: disabled due to OOM
31
  }
32
  DEFAULT_MODEL_A_SIZE = "1.5b"
@@ -68,15 +68,18 @@ def _display_name_from_filename(filename: str) -> str:
68
  size_b = _parse_size_b(filename)
69
  size_text = f"{size_b:.1f}B" if size_b is not None else "Unknown"
70
  family = "RWKV7"
71
- if "g1d" in filename.lower():
72
- family = "RWKV7-G1D"
73
- elif "g1c" in filename.lower():
74
- family = "RWKV7-G1C"
75
  return f"{family}-{size_text}"
76
 
77
 
78
  def _size_to_pattern(size_key: str) -> str:
79
- return f"rwkv7-g1d-{size_key}-*.pth"
 
 
 
 
80
 
81
 
82
  def _extract_date_token(filename: str):
@@ -104,11 +107,18 @@ def _list_repo_files():
104
 
105
 
106
  def _find_remote_filename_for_size(size_key: str, repo_files):
107
- pattern = re.compile(rf"^rwkv7-g1d-{re.escape(size_key)}-.*\.pth$", re.IGNORECASE)
108
  matches = [f for f in repo_files if pattern.match(f)]
109
  return _pick_best_filename(matches)
110
 
111
 
 
 
 
 
 
 
 
112
  def _ensure_model_file(size_key: str, repo_files_cache=None) -> str:
113
  """Ensure one specific size model exists in local models directory.
114
 
@@ -121,26 +131,37 @@ def _ensure_model_file(size_key: str, repo_files_cache=None) -> str:
121
  preferred_path = MODELS_DIR / preferred
122
  if preferred_path.exists():
123
  return str(preferred_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
  local_filename = _find_local_filename_for_size(size_key)
126
  if local_filename:
127
  return str(MODELS_DIR / local_filename)
128
 
129
- if repo_files_cache is None:
130
- repo_files_cache = _list_repo_files()
131
-
132
- remote_filename = preferred
133
- if remote_filename is None or remote_filename not in repo_files_cache:
134
- remote_filename = _find_remote_filename_for_size(size_key, repo_files_cache)
135
 
136
  if not remote_filename:
137
  raise RuntimeError(
138
  f"Could not find remote RWKV file for size {size_key} in repo {HF_REPO_ID}."
139
  )
140
 
 
141
  from huggingface_hub import hf_hub_download
142
 
143
- print(f"Downloading missing model {remote_filename} from {HF_REPO_ID} ...")
144
  local_path = hf_hub_download(
145
  repo_id=HF_REPO_ID,
146
  filename=remote_filename,
@@ -256,11 +277,17 @@ def load_precomputed_example():
256
  if html_path.exists() and metadata_path.exists():
257
  import json
258
 
259
- with open(html_path, "r", encoding="utf-8") as f:
260
- _precomputed_html = f.read()
261
  with open(metadata_path, "r", encoding="utf-8") as f:
262
  metadata = json.load(f)
263
- _precomputed_text = metadata.get("example_text", "")
 
 
 
 
 
 
 
 
264
  print(f"Loaded precomputed example ({len(_precomputed_text)} chars)")
265
  return True
266
 
@@ -317,10 +344,11 @@ def get_model_dropdown_choices():
317
  fallback_specs = []
318
  for size_key in REQUIRED_MODEL_SIZES:
319
  preferred = PREFERRED_MODEL_FILENAMES.get(size_key)
320
- fname = preferred if preferred else f"rwkv7-g1d-{size_key}-*.pth"
321
- fallback_specs.append((size_key, fname))
 
322
 
323
- choices = [f"RWKV7-G1D-{s.upper()} ({f})" for s, f in fallback_specs]
324
  value_a = choices[1] if len(choices) > 1 else (choices[0] if choices else None)
325
  value_b = choices[2] if len(choices) > 2 else (choices[0] if choices else None)
326
  return choices, value_a, value_b
 
26
  PREFERRED_MODEL_FILENAMES = {
27
  "0.1b": "rwkv7-g1d-0.1b-20260129-ctx8192.pth",
28
  "0.4b": "rwkv7-g1d-0.4b-20260210-ctx8192.pth",
29
+ "1.5b": "rwkv7-g1f-1.5b-20260419-ctx8192.pth",
30
  # "2.9b": "rwkv7-g1d-2.9b-20260131-ctx8192.pth", # TEMP: disabled due to OOM
31
  }
32
  DEFAULT_MODEL_A_SIZE = "1.5b"
 
68
  size_b = _parse_size_b(filename)
69
  size_text = f"{size_b:.1f}B" if size_b is not None else "Unknown"
70
  family = "RWKV7"
71
+ family_match = re.match(r"^(rwkv[0-9a-z]*)-([^-]+)-", filename.lower())
72
+ if family_match:
73
+ family = f"{family_match.group(1).upper()}-{family_match.group(2).upper()}"
 
74
  return f"{family}-{size_text}"
75
 
76
 
77
  def _size_to_pattern(size_key: str) -> str:
78
+ return f"rwkv7-*-{size_key}-*.pth"
79
+
80
+
81
+ def _size_to_regex(size_key: str):
82
+ return re.compile(rf"^rwkv7-[^-]+-{re.escape(size_key)}-.*\.pth$", re.IGNORECASE)
83
 
84
 
85
  def _extract_date_token(filename: str):
 
107
 
108
 
109
  def _find_remote_filename_for_size(size_key: str, repo_files):
110
+ pattern = _size_to_regex(size_key)
111
  matches = [f for f in repo_files if pattern.match(f)]
112
  return _pick_best_filename(matches)
113
 
114
 
115
def _expected_precomputed_model_files():
    """Map precomputed-metadata keys to the currently preferred filenames.

    Returns:
        A dict with keys ``small_model_file`` and ``large_model_file``.
        A value is ``None`` when no preferred filename is configured for
        the corresponding default size.
    """
    small_file = PREFERRED_MODEL_FILENAMES.get(DEFAULT_MODEL_B_SIZE)
    large_file = PREFERRED_MODEL_FILENAMES.get(DEFAULT_MODEL_A_SIZE)
    return {
        "small_model_file": small_file,
        "large_model_file": large_file,
    }
120
+
121
+
122
  def _ensure_model_file(size_key: str, repo_files_cache=None) -> str:
123
  """Ensure one specific size model exists in local models directory.
124
 
 
131
  preferred_path = MODELS_DIR / preferred
132
  if preferred_path.exists():
133
  return str(preferred_path)
134
+ if repo_files_cache is None:
135
+ repo_files_cache = _list_repo_files()
136
+ if preferred in repo_files_cache:
137
+ from huggingface_hub import hf_hub_download
138
+
139
+ print(f"Downloading preferred model {preferred} from {HF_REPO_ID} ...")
140
+ local_path = hf_hub_download(
141
+ repo_id=HF_REPO_ID,
142
+ filename=preferred,
143
+ local_dir=str(MODELS_DIR),
144
+ local_dir_use_symlinks=False,
145
+ )
146
+ return str(Path(local_path).resolve())
147
+
148
+ if repo_files_cache is None:
149
+ repo_files_cache = _list_repo_files()
150
 
151
  local_filename = _find_local_filename_for_size(size_key)
152
  if local_filename:
153
  return str(MODELS_DIR / local_filename)
154
 
155
+ remote_filename = _find_remote_filename_for_size(size_key, repo_files_cache)
 
 
 
 
 
156
 
157
  if not remote_filename:
158
  raise RuntimeError(
159
  f"Could not find remote RWKV file for size {size_key} in repo {HF_REPO_ID}."
160
  )
161
 
162
+ print(f"Downloading missing model {remote_filename} from {HF_REPO_ID} ...")
163
  from huggingface_hub import hf_hub_download
164
 
 
165
  local_path = hf_hub_download(
166
  repo_id=HF_REPO_ID,
167
  filename=remote_filename,
 
277
  if html_path.exists() and metadata_path.exists():
278
  import json
279
 
 
 
280
  with open(metadata_path, "r", encoding="utf-8") as f:
281
  metadata = json.load(f)
282
+ expected_files = _expected_precomputed_model_files()
283
+ for key, expected in expected_files.items():
284
+ if expected and metadata.get(key) != expected:
285
+ print("Precomputed example is stale for the current preferred model set. Run precompute_example.py to refresh it.")
286
+ return False
287
+
288
+ with open(html_path, "r", encoding="utf-8") as f:
289
+ _precomputed_html = f.read()
290
+ _precomputed_text = metadata.get("example_text", "")
291
  print(f"Loaded precomputed example ({len(_precomputed_text)} chars)")
292
  return True
293
 
 
344
  fallback_specs = []
345
  for size_key in REQUIRED_MODEL_SIZES:
346
  preferred = PREFERRED_MODEL_FILENAMES.get(size_key)
347
+ fname = preferred if preferred else _size_to_pattern(size_key)
348
+ display_name = _display_name_from_filename(fname) if preferred else f"RWKV7-{size_key.upper()}"
349
+ fallback_specs.append((display_name, fname))
350
 
351
+ choices = [f"{display_name} ({fname})" for display_name, fname in fallback_specs]
352
  value_a = choices[1] if len(choices) > 1 else (choices[0] if choices else None)
353
  value_b = choices[2] if len(choices) > 2 else (choices[0] if choices else None)
354
  return choices, value_a, value_b
precompute_example.py CHANGED
@@ -27,12 +27,9 @@ SMALL_SIZE_KEY = "0.4b"
27
  LARGE_SIZE_KEY = "1.5b"
28
  PREFERRED = {
29
  "0.4b": "rwkv7-g1d-0.4b-20260210-ctx8192.pth",
30
- "1.5b": "rwkv7-g1d-1.5b-20260212-ctx8192.pth",
31
  }
32
 
33
- SMALL_MODEL_NAME = "RWKV7-G1D-0.4B"
34
- LARGE_MODEL_NAME = "RWKV7-G1D-1.5B"
35
-
36
  # Detect device
37
  # DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
38
  DEVICE = "cpu"
@@ -50,6 +47,19 @@ def _pick_best_filename(filenames):
50
  return sorted(filenames, key=lambda x: (_extract_date_token(x), x))[-1]
51
 
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def _list_repo_files():
54
  from huggingface_hub import HfApi
55
 
@@ -64,14 +74,27 @@ def resolve_rwkv_model_path(size_key: str) -> str:
64
  if preferred and (MODELS_DIR / preferred).exists():
65
  return str((MODELS_DIR / preferred).resolve())
66
 
67
- pattern = f"rwkv7-g1d-{size_key}-*.pth"
68
- local_matches = [p.name for p in MODELS_DIR.glob(pattern)]
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  local_best = _pick_best_filename(local_matches)
70
  if local_best:
71
  return str((MODELS_DIR / local_best).resolve())
72
 
73
  repo_files = _list_repo_files()
74
- remote_candidates = [f for f in repo_files if re.match(rf"^rwkv7-g1d-{re.escape(size_key)}-.*\.pth$", f, re.IGNORECASE)]
75
 
76
  remote_file = preferred if preferred in remote_candidates else _pick_best_filename(remote_candidates)
77
  if not remote_file:
@@ -126,33 +149,35 @@ def precompute_example():
126
 
127
  print(f"Example text length: {len(example_text)} characters")
128
 
129
- print(f"Resolving {SMALL_MODEL_NAME} model path...")
130
  small_model_path = resolve_rwkv_model_path(SMALL_SIZE_KEY)
 
 
131
 
132
- print(f"Resolving {LARGE_MODEL_NAME} model path...")
133
  large_model_path = resolve_rwkv_model_path(LARGE_SIZE_KEY)
 
 
134
 
135
- print(f"Loading {SMALL_MODEL_NAME}...")
136
  small_model, small_tokenizer = load_rwkv7_model(small_model_path)
137
 
138
- print(f"Loading {LARGE_MODEL_NAME}...")
139
  large_model, large_tokenizer = load_rwkv7_model(large_model_path)
140
 
141
- print(f"Evaluating with {SMALL_MODEL_NAME}...")
142
  result_small = evaluate_rwkv7_single_sample(small_model, small_tokenizer, example_text)
143
- print(f"{SMALL_MODEL_NAME} completed in {result_small['inference_time']:.2f}s")
144
 
145
- print(f"Evaluating with {LARGE_MODEL_NAME}...")
146
  result_large = evaluate_rwkv7_single_sample(large_model, large_tokenizer, example_text)
147
- print(f"{LARGE_MODEL_NAME} completed in {result_large['inference_time']:.2f}s")
148
 
149
  print("Generating visualization...")
150
  html = generate_comparison_html(
151
  text=example_text,
152
  byte_losses_a=result_large["byte_wise_losses"],
153
  byte_losses_b=result_small["byte_wise_losses"],
154
- model_a_name=LARGE_MODEL_NAME,
155
- model_b_name=SMALL_MODEL_NAME,
156
  topk_predictions_a=result_large["top5_predictions"],
157
  topk_predictions_b=result_small["top5_predictions"],
158
  tokenizer_a=result_large["tokenizer"],
 
27
  LARGE_SIZE_KEY = "1.5b"
28
  PREFERRED = {
29
  "0.4b": "rwkv7-g1d-0.4b-20260210-ctx8192.pth",
30
+ "1.5b": "rwkv7-g1f-1.5b-20260419-ctx8192.pth",
31
  }
32
 
 
 
 
33
  # Detect device
34
  # DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
35
  DEVICE = "cpu"
 
47
  return sorted(filenames, key=lambda x: (_extract_date_token(x), x))[-1]
48
 
49
 
50
def _display_name_from_filename(filename: str) -> str:
    """Derive a display name such as ``RWKV7-G1F-1.5B`` from a checkpoint name.

    The filename is lower-cased and parsed as ``<model>-<series>-<size>b``.
    The size token may be followed by ``-``, ``.`` or the end of the string,
    so names without a date/context suffix (``rwkv7-g1d-0.4b.pth``) are
    recognised as well.

    Args:
        filename: Bare checkpoint filename (no directory part).

    Returns:
        ``<MODEL>-<SERIES>-<size>B`` with the size shown to one decimal
        place, or ``filename`` unchanged when it does not follow the scheme.
    """
    family_match = re.match(
        # Lookahead keeps "0.4billion" from being read as a size token.
        r"^(rwkv[0-9a-z]*)-([^-]+)-(\d+(?:\.\d+)?)b(?=[-.]|$)",
        filename.lower(),
    )
    if not family_match:
        return filename
    model, series, size = family_match.groups()
    return f"{model.upper()}-{series.upper()}-{float(size):.1f}B"
57
+
58
+
59
def _size_to_regex(size_key: str):
    """Build the filename matcher for RWKV7 checkpoints of ``size_key``.

    Matching is case-insensitive and anchored at both ends. The family
    segment is any dash-free token, and ``size_key`` is escaped so it is
    matched literally.

    Args:
        size_key: Size token as it appears in filenames, e.g. ``"1.5b"``.

    Returns:
        A compiled regular expression object.
    """
    escaped_size = re.escape(size_key)
    return re.compile(rf"^rwkv7-[^-]+-{escaped_size}-.*\.pth$", re.IGNORECASE)
61
+
62
+
63
  def _list_repo_files():
64
  from huggingface_hub import HfApi
65
 
 
74
  if preferred and (MODELS_DIR / preferred).exists():
75
  return str((MODELS_DIR / preferred).resolve())
76
 
77
+ if preferred:
78
+ repo_files = _list_repo_files()
79
+ if preferred in repo_files:
80
+ from huggingface_hub import hf_hub_download
81
+
82
+ print(f"Downloading preferred model {preferred} from {HF_REPO_ID} ...")
83
+ local_path = hf_hub_download(
84
+ repo_id=HF_REPO_ID,
85
+ filename=preferred,
86
+ local_dir=str(MODELS_DIR),
87
+ local_dir_use_symlinks=False,
88
+ )
89
+ return str(Path(local_path).resolve())
90
+
91
+ local_matches = [p.name for p in MODELS_DIR.glob(f"rwkv7-*-{size_key}-*.pth")]
92
  local_best = _pick_best_filename(local_matches)
93
  if local_best:
94
  return str((MODELS_DIR / local_best).resolve())
95
 
96
  repo_files = _list_repo_files()
97
+ remote_candidates = [f for f in repo_files if _size_to_regex(size_key).match(f)]
98
 
99
  remote_file = preferred if preferred in remote_candidates else _pick_best_filename(remote_candidates)
100
  if not remote_file:
 
149
 
150
  print(f"Example text length: {len(example_text)} characters")
151
 
 
152
  small_model_path = resolve_rwkv_model_path(SMALL_SIZE_KEY)
153
+ small_model_name = _display_name_from_filename(Path(small_model_path).name)
154
+ print(f"Resolving {small_model_name} model path...")
155
 
 
156
  large_model_path = resolve_rwkv_model_path(LARGE_SIZE_KEY)
157
+ large_model_name = _display_name_from_filename(Path(large_model_path).name)
158
+ print(f"Resolving {large_model_name} model path...")
159
 
160
+ print(f"Loading {small_model_name}...")
161
  small_model, small_tokenizer = load_rwkv7_model(small_model_path)
162
 
163
+ print(f"Loading {large_model_name}...")
164
  large_model, large_tokenizer = load_rwkv7_model(large_model_path)
165
 
166
+ print(f"Evaluating with {small_model_name}...")
167
  result_small = evaluate_rwkv7_single_sample(small_model, small_tokenizer, example_text)
168
+ print(f"{small_model_name} completed in {result_small['inference_time']:.2f}s")
169
 
170
+ print(f"Evaluating with {large_model_name}...")
171
  result_large = evaluate_rwkv7_single_sample(large_model, large_tokenizer, example_text)
172
+ print(f"{large_model_name} completed in {result_large['inference_time']:.2f}s")
173
 
174
  print("Generating visualization...")
175
  html = generate_comparison_html(
176
  text=example_text,
177
  byte_losses_a=result_large["byte_wise_losses"],
178
  byte_losses_b=result_small["byte_wise_losses"],
179
+ model_a_name=large_model_name,
180
+ model_b_name=small_model_name,
181
  topk_predictions_a=result_large["top5_predictions"],
182
  topk_predictions_b=result_small["top5_predictions"],
183
  tokenizer_a=result_large["tokenizer"],
precomputed/example_metadata.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "example_text": "The Bitter Lesson\nRich Sutton\nMarch 13, 2019\nThe biggest lesson that can be read from 70 years of AI research is that general methods that leverage computation are ultimately the most effective, and by a large margin. The ultimate reason for this is Moore's law, or rather its generalization of continued exponentially falling cost per unit of computation. Most AI research has been conducted as if the computation available to the agent were constant (in which case leveraging human knowledge would be one of the only ways to improve performance) but, over a slightly longer time than a typical research project, massively more computation inevitably becomes available. Seeking an improvement that makes a difference in the shorter term, researchers seek to leverage their human knowledge of the domain, but the only thing that matters in the long run is the leveraging of computation. These two need not run counter to each other, but in practice they tend to. Time spent on one is time not spent on the other. There are psychological commitments to investment in one approach or the other. And the human-knowledge approach tends to complicate methods in ways that make them less suited to taking advantage of general methods leveraging computation. There were many examples of AI researchers' belated learning of this bitter lesson, and it is instructive to review some of the most prominent.\n\nIn computer chess, the methods that defeated the world champion, Kasparov, in 1997, were based on massive, deep search. At the time, this was looked upon with dismay by the majority of computer-chess researchers who had pursued methods that leveraged human understanding of the special structure of chess. When a simpler, search-based approach with special hardware and software proved vastly more effective, these human-knowledge-based chess researchers were not good losers. 
They said that ``brute force\" search may have won this time, but it was not a general strategy, and anyway it was not how people played chess. These researchers wanted methods based on human input to win and were disappointed when they did not.\n\nA similar pattern of research progress was seen in computer Go, only delayed by a further 20 years. Enormous initial efforts went into avoiding search by taking advantage of human knowledge, or of the special features of the game, but all those efforts proved irrelevant, or worse, once search was applied effectively at scale. Also important was the use of learning by self play to learn a value function (as it was in many other games and even in chess, although learning did not play a big role in the 1997 program that first beat a world champion). Learning by self play, and learning in general, is like search in that it enables massive computation to be brought to bear. Search and learning are the two most important classes of techniques for utilizing massive amounts of computation in AI research. In computer Go, as in computer chess, researchers' initial effort was directed towards utilizing human understanding (so that less search was needed) and only much later was much greater success had by embracing search and learning.\n\nIn speech recognition, there was an early competition, sponsored by DARPA, in the 1970s. Entrants included a host of special methods that took advantage of human knowledge---knowledge of words, of phonemes, of the human vocal tract, etc. On the other side were newer methods that were more statistical in nature and did much more computation, based on hidden Markov models (HMMs). Again, the statistical methods won out over the human-knowledge-based methods. This led to a major change in all of natural language processing, gradually over decades, where statistics and computation came to dominate the field. 
The recent rise of deep learning in speech recognition is the most recent step in this consistent direction. Deep learning methods rely even less on human knowledge, and use even more computation, together with learning on huge training sets, to produce dramatically better speech recognition systems. As in the games, researchers always tried to make systems that worked the way the researchers thought their own minds worked---they tried to put that knowledge in their systems---but it proved ultimately counterproductive, and a colossal waste of researcher's time, when, through Moore's law, massive computation became available and a means was found to put it to good use.\n\nIn computer vision, there has been a similar pattern. Early methods conceived of vision as searching for edges, or generalized cylinders, or in terms of SIFT features. But today all this is discarded. Modern deep-learning neural networks use only the notions of convolution and certain kinds of invariances, and perform much better.\n\nThis is a big lesson. As a field, we still have not thoroughly learned it, as we are continuing to make the same kind of mistakes. To see this, and to effectively resist it, we have to understand the appeal of these mistakes. We have to learn the bitter lesson that building in how we think we think does not work in the long run. The bitter lesson is based on the historical observations that 1) AI researchers have often tried to build knowledge into their agents, 2) this always helps in the short term, and is personally satisfying to the researcher, but 3) in the long run it plateaus and even inhibits further progress, and 4) breakthrough progress eventually arrives by an opposing approach based on scaling computation by search and learning. 
The eventual success is tinged with bitterness, and often incompletely digested, because it is success over a favored, human-centric approach.\n\nOne thing that should be learned from the bitter lesson is the great power of general purpose methods, of methods that continue to scale with increased computation even as the available computation becomes very great. The two methods that seem to scale arbitrarily in this way are search and learning.\n\nThe second general point to be learned from the bitter lesson is that the actual contents of minds are tremendously, irredeemably complex; we should stop trying to find simple ways to think about the contents of minds, such as simple ways to think about space, objects, multiple agents, or symmetries. All these are part of the arbitrary, intrinsically-complex, outside world. They are not what should be built in, as their complexity is endless; instead we should build in only the meta-methods that can find and capture this arbitrary complexity. Essential to these methods is that they can find good approximations, but the search for them should be by our methods, not by us. We want AI agents that can discover like we can, not which contain what we have discovered. Building in our discoveries only makes it harder to see how the discovering process can be done.\n",
3
- "small_model_inference_time": 17.115161657333374,
4
- "large_model_inference_time": 24.418970823287964,
5
  "small_model_compression_rate": 52.34462304489324,
6
- "large_model_compression_rate": 47.62502588510778,
7
  "small_model_file": "rwkv7-g1d-0.4b-20260210-ctx8192.pth",
8
- "large_model_file": "rwkv7-g1d-1.5b-20260212-ctx8192.pth"
9
  }
 
1
  {
2
  "example_text": "The Bitter Lesson\nRich Sutton\nMarch 13, 2019\nThe biggest lesson that can be read from 70 years of AI research is that general methods that leverage computation are ultimately the most effective, and by a large margin. The ultimate reason for this is Moore's law, or rather its generalization of continued exponentially falling cost per unit of computation. Most AI research has been conducted as if the computation available to the agent were constant (in which case leveraging human knowledge would be one of the only ways to improve performance) but, over a slightly longer time than a typical research project, massively more computation inevitably becomes available. Seeking an improvement that makes a difference in the shorter term, researchers seek to leverage their human knowledge of the domain, but the only thing that matters in the long run is the leveraging of computation. These two need not run counter to each other, but in practice they tend to. Time spent on one is time not spent on the other. There are psychological commitments to investment in one approach or the other. And the human-knowledge approach tends to complicate methods in ways that make them less suited to taking advantage of general methods leveraging computation. There were many examples of AI researchers' belated learning of this bitter lesson, and it is instructive to review some of the most prominent.\n\nIn computer chess, the methods that defeated the world champion, Kasparov, in 1997, were based on massive, deep search. At the time, this was looked upon with dismay by the majority of computer-chess researchers who had pursued methods that leveraged human understanding of the special structure of chess. When a simpler, search-based approach with special hardware and software proved vastly more effective, these human-knowledge-based chess researchers were not good losers. 
They said that ``brute force\" search may have won this time, but it was not a general strategy, and anyway it was not how people played chess. These researchers wanted methods based on human input to win and were disappointed when they did not.\n\nA similar pattern of research progress was seen in computer Go, only delayed by a further 20 years. Enormous initial efforts went into avoiding search by taking advantage of human knowledge, or of the special features of the game, but all those efforts proved irrelevant, or worse, once search was applied effectively at scale. Also important was the use of learning by self play to learn a value function (as it was in many other games and even in chess, although learning did not play a big role in the 1997 program that first beat a world champion). Learning by self play, and learning in general, is like search in that it enables massive computation to be brought to bear. Search and learning are the two most important classes of techniques for utilizing massive amounts of computation in AI research. In computer Go, as in computer chess, researchers' initial effort was directed towards utilizing human understanding (so that less search was needed) and only much later was much greater success had by embracing search and learning.\n\nIn speech recognition, there was an early competition, sponsored by DARPA, in the 1970s. Entrants included a host of special methods that took advantage of human knowledge---knowledge of words, of phonemes, of the human vocal tract, etc. On the other side were newer methods that were more statistical in nature and did much more computation, based on hidden Markov models (HMMs). Again, the statistical methods won out over the human-knowledge-based methods. This led to a major change in all of natural language processing, gradually over decades, where statistics and computation came to dominate the field. 
The recent rise of deep learning in speech recognition is the most recent step in this consistent direction. Deep learning methods rely even less on human knowledge, and use even more computation, together with learning on huge training sets, to produce dramatically better speech recognition systems. As in the games, researchers always tried to make systems that worked the way the researchers thought their own minds worked---they tried to put that knowledge in their systems---but it proved ultimately counterproductive, and a colossal waste of researcher's time, when, through Moore's law, massive computation became available and a means was found to put it to good use.\n\nIn computer vision, there has been a similar pattern. Early methods conceived of vision as searching for edges, or generalized cylinders, or in terms of SIFT features. But today all this is discarded. Modern deep-learning neural networks use only the notions of convolution and certain kinds of invariances, and perform much better.\n\nThis is a big lesson. As a field, we still have not thoroughly learned it, as we are continuing to make the same kind of mistakes. To see this, and to effectively resist it, we have to understand the appeal of these mistakes. We have to learn the bitter lesson that building in how we think we think does not work in the long run. The bitter lesson is based on the historical observations that 1) AI researchers have often tried to build knowledge into their agents, 2) this always helps in the short term, and is personally satisfying to the researcher, but 3) in the long run it plateaus and even inhibits further progress, and 4) breakthrough progress eventually arrives by an opposing approach based on scaling computation by search and learning. 
The eventual success is tinged with bitterness, and often incompletely digested, because it is success over a favored, human-centric approach.\n\nOne thing that should be learned from the bitter lesson is the great power of general purpose methods, of methods that continue to scale with increased computation even as the available computation becomes very great. The two methods that seem to scale arbitrarily in this way are search and learning.\n\nThe second general point to be learned from the bitter lesson is that the actual contents of minds are tremendously, irredeemably complex; we should stop trying to find simple ways to think about the contents of minds, such as simple ways to think about space, objects, multiple agents, or symmetries. All these are part of the arbitrary, intrinsically-complex, outside world. They are not what should be built in, as their complexity is endless; instead we should build in only the meta-methods that can find and capture this arbitrary complexity. Essential to these methods is that they can find good approximations, but the search for them should be by our methods, not by us. We want AI agents that can discover like we can, not which contain what we have discovered. Building in our discoveries only makes it harder to see how the discovering process can be done.\n",
3
+ "small_model_inference_time": 16.93763828277588,
4
+ "large_model_inference_time": 27.36628532409668,
5
  "small_model_compression_rate": 52.34462304489324,
6
+ "large_model_compression_rate": 47.41049874148117,
7
  "small_model_file": "rwkv7-g1d-0.4b-20260210-ctx8192.pth",
8
+ "large_model_file": "rwkv7-g1f-1.5b-20260419-ctx8192.pth"
9
  }
precomputed/example_visualization.html CHANGED
The diff for this file is too large to render. See raw diff