Spaces:
Running on Zero
Running on Zero
GitHub Actions
feat: WebLLM browser agent with PeerJS mesh, HybridRAG, news signals, and easter-egg ticker
78cc96f | // src/news/ingest.js | |
| // Browser-only RSS/Atom ingestion. Feeds usually block cross-origin reads, so | |
| // we route through public CORS proxies (with fallback) and parse with DOMParser. | |
| // CORS proxies, best-first. allorigins `/get` returns { contents } with an | |
| // Access-Control-Allow-Origin header; r.jina.ai returns readable text. Both | |
| // are CORS-enabled. allorigins rate-limits bursts, so callers throttle. | |
| const PROXIES = [ | |
| { build: (u) => `https://api.allorigins.win/get?url=${encodeURIComponent(u)}`, json: true }, | |
| { build: (u) => `https://r.jina.ai/${u}` }, | |
| ]; | |
| async function fetchText(url, { signal } = {}) { | |
| let lastErr = null; | |
| for (const p of PROXIES) { | |
| try { | |
| const res = await fetch(p.build(url), { signal }); | |
| if (!res.ok) throw new Error(`HTTP ${res.status}`); | |
| let text; | |
| if (p.json) { | |
| const data = await res.json(); | |
| text = data.contents || ""; | |
| } else { | |
| text = await res.text(); | |
| } | |
| if (text && text.length > 50) return text; | |
| } catch (err) { | |
| lastErr = err; | |
| } | |
| } | |
| throw lastErr || new Error("All proxies failed"); | |
| } | |
| function text(node, sel) { | |
| const el = node.querySelector(sel); | |
| return el ? el.textContent.trim() : ""; | |
| } | |
| function parseFeed(xml, sourceName) { | |
| const doc = new DOMParser().parseFromString(xml, "text/xml"); | |
| if (doc.querySelector("parsererror")) return []; | |
| // RSS <item> or Atom <entry> | |
| const nodes = [...doc.querySelectorAll("item"), ...doc.querySelectorAll("entry")]; | |
| return nodes.map((n) => { | |
| const title = text(n, "title"); | |
| let link = text(n, "link"); | |
| if (!link) { | |
| const a = n.querySelector("link"); | |
| link = a?.getAttribute("href") || ""; | |
| } | |
| const summary = text(n, "description") || text(n, "summary") || text(n, "content"); | |
| const published = text(n, "pubDate") || text(n, "published") || text(n, "updated"); | |
| return { | |
| title, | |
| link, | |
| summary: summary.replace(/<[^>]+>/g, "").slice(0, 400), | |
| source: sourceName, | |
| published, | |
| ts: published ? Date.parse(published) || Date.now() : Date.now(), | |
| }; | |
| }); | |
| } | |
| // Ingest a list of sources. Returns merged, de-duped, newest-first items. | |
| // Fetches are throttled (small concurrent pool) so shared proxies don't | |
| // rate-limit us and drop the CORS header on error responses. | |
| export async function ingestSources(sources, { signal, onSource, concurrency = 3 } = {}) { | |
| const queue = [...sources]; | |
| const collected = []; | |
| async function worker() { | |
| while (queue.length) { | |
| const s = queue.shift(); | |
| try { | |
| const xml = await fetchText(s.url, { signal }); | |
| const items = parseFeed(xml, s.name).map((it) => ({ ...it, cat: s.cat })); | |
| collected.push(...items); | |
| onSource?.({ source: s.name, count: items.length }); | |
| } catch { | |
| onSource?.({ source: s.name, count: 0 }); | |
| } | |
| } | |
| } | |
| await Promise.all(Array.from({ length: Math.min(concurrency, sources.length) }, worker)); | |
| const merged = []; | |
| const seen = new Set(); | |
| for (const it of collected) { | |
| const key = it.link || it.title; | |
| if (!key || seen.has(key)) continue; | |
| seen.add(key); | |
| merged.push(it); | |
| } | |
| merged.sort((a, b) => b.ts - a.ts); | |
| return merged; | |
| } | |
| // Generic readable-text scrape via r.jina.ai (CORS-friendly, returns markdown). | |
| export async function scrapeUrl(url, { signal } = {}) { | |
| const res = await fetch(`https://r.jina.ai/${url}`, { signal }); | |
| if (!res.ok) throw new Error(`scrape failed: HTTP ${res.status}`); | |
| return (await res.text()).slice(0, 12000); | |
| } | |
| // Web search via Google News RSS (good for current events; CORS-proxied). | |
| export async function webSearchNews(query, { signal } = {}) { | |
| const url = `https://news.google.com/rss/search?q=${encodeURIComponent(query)}&hl=en-US&gl=US&ceid=US:en`; | |
| const xml = await fetchText(url, { signal }); | |
| const items = parseFeed(xml, "Google News").slice(0, 8); | |
| if (!items.length) return "No results."; | |
| return items.map((it, i) => `${i + 1}. ${it.title}\n ${it.link}`).join("\n"); | |
| } | |