Spaces:

cwpkd
/

Todlong

Runtime error

App Files Files Community

Todlong / utils /scraper.py

cwpkd

Create utils/scraper.py

e9f1adf verified 7 months ago

raw

history blame contribute delete

3.82 kB

	# utils/scraper.py
	"""
	Yahoo Finance news scraper
	"""

	import requests
	from bs4 import BeautifulSoup
	import yfinance as yf
	from typing import List, Dict
	import time
	from config import USER_AGENT


	class YahooFinanceScraper:
	    """Scrape news articles from Yahoo Finance.

	    News items are obtained through ``yfinance``. When an item carries no
	    summary, the article page is fetched with ``requests`` and a short
	    description is pulled out of its HTML with BeautifulSoup.
	    """

	    # Pause (in seconds) after each article-page request.
	    REQUEST_DELAY_SECONDS = 0.5

	    def __init__(self):
	        # Headers sent with every article-page request in _extract_summary.
	        self.headers = {
	            'User-Agent': USER_AGENT,
	        }

	    @staticmethod
	    def _thumbnail_url(item: Dict) -> str:
	        """
	        Return the URL of the first thumbnail resolution of a news item

	        Args:
	            item: Raw news item dictionary

	        Returns:
	            The URL string, or '' when the thumbnail data is missing, empty
	            or not shaped as expected (never raises for those cases)
	        """
	        thumbnail = item.get('thumbnail')
	        if not isinstance(thumbnail, dict):
	            return ''

	        resolutions = thumbnail.get('resolutions')
	        # An empty list used to raise IndexError here, which aborted the
	        # whole fetch; treat it the same as "no thumbnail".
	        if not isinstance(resolutions, (list, tuple)) or not resolutions:
	            return ''

	        first = resolutions[0]
	        if not isinstance(first, dict):
	            return ''
	        return first.get('url', '')

	    @classmethod
	    def _parse_item(cls, item: Dict) -> Dict:
	        """
	        Map one raw news item onto the article fields (summary excluded)

	        Args:
	            item: Raw news item dictionary

	        Returns:
	            Dictionary with title, publisher, link, publish_time, type and
	            thumbnail keys; absent fields fall back to their defaults
	        """
	        # NOTE(review): newer yfinance releases may nest these fields under
	        # a 'content' key - confirm against the installed version.
	        return {
	            'title': item.get('title', ''),
	            'publisher': item.get('publisher', 'Unknown'),
	            'link': item.get('link', ''),
	            'publish_time': item.get('providerPublishTime', 0),
	            'type': item.get('type', 'STORY'),
	            'thumbnail': cls._thumbnail_url(item),
	        }

	    def get_stock_news(self, symbol: str, max_articles: int = 10) -> List[Dict]:
	        """
	        Get news articles for a specific stock symbol

	        Args:
	            symbol: Stock ticker symbol (e.g., 'AAPL')
	            max_articles: Maximum number of articles to retrieve; values
	                below zero are treated as zero

	        Returns:
	            List of dictionaries containing article information. Any
	            failure is printed and results in an empty list; this method
	            does not raise.
	        """
	        try:
	            # Use yfinance to get news; a missing result means "no news".
	            news = yf.Ticker(symbol).news or []

	            articles = []
	            for item in news[:max(max_articles, 0)]:
	                if not isinstance(item, dict):
	                    # Skip entries that are not key/value records rather
	                    # than discarding every article.
	                    continue

	                article = self._parse_item(item)

	                # Try to get article summary/description
	                if 'summary' in item:
	                    article['summary'] = item['summary']
	                else:
	                    article['summary'] = self._extract_summary(article['link'])
	                    if article['link']:
	                        # Only pause when a page was actually requested.
	                        time.sleep(self.REQUEST_DELAY_SECONDS)  # Be polite to the server

	                articles.append(article)

	            return articles

	        except Exception as e:
	            print(f"Error fetching news for {symbol}: {e}")
	            return []

	    def _extract_summary(self, url: str) -> str:
	        """
	        Extract article summary from URL

	        Looks, in order, at the ``description`` meta tag, the
	        ``og:description`` meta tag, and the first paragraph longer than
	        50 characters (truncated to 300 characters).

	        Args:
	            url: Article URL

	        Returns:
	            Article summary text, "No summary available" when the page has
	            nothing usable, or "Could not extract summary" when the URL is
	            empty or the page cannot be fetched or parsed
	        """
	        if not url:
	            # Nothing to fetch: same result as a failed request, without
	            # issuing a request that is certain to fail.
	            return "Could not extract summary"

	        try:
	            response = requests.get(url, headers=self.headers, timeout=10)
	            # Do not mistake an HTTP error page for the article itself.
	            response.raise_for_status()
	            soup = BeautifulSoup(response.content, 'html.parser')

	            # Try the meta description first, then og:description
	            for attrs in ({'name': 'description'}, {'property': 'og:description'}):
	                tag = soup.find('meta', attrs=attrs)
	                if tag and tag.get('content'):
	                    return tag['content']

	            # Fallback to first paragraph with some substance
	            for p in soup.find_all('p'):
	                text = p.get_text().strip()
	                if len(text) > 50:
	                    return text[:300]

	            return "No summary available"

	        except Exception as e:
	            print(f"Error extracting summary: {e}")
	            return "Could not extract summary"

	    def get_market_news(self, max_articles: int = 10) -> List[Dict]:
	        """
	        Get general market news

	        Delegates to get_stock_news using the "^GSPC" (S&P 500) index
	        symbol as a stand-in for the overall market.

	        Args:
	            max_articles: Maximum number of articles to retrieve

	        Returns:
	            List of dictionaries containing article information
	        """
	        # For general market news, use popular index symbols
	        return self.get_stock_news("^GSPC", max_articles)  # S&P 500