| |
| """ |
| Yahoo Finance news scraper |
| """ |
|
|
| import requests |
| from bs4 import BeautifulSoup |
| import yfinance as yf |
| from typing import List, Dict |
| import time |
| from config import USER_AGENT |
|
|
|
|
class YahooFinanceScraper:
    """Scrape news articles from Yahoo Finance.

    Article metadata comes from ``yfinance``; when a feed item carries no
    summary, the article page itself is fetched and a summary is extracted
    from its HTML.
    """

    # Pause after each article-page fetch so consecutive requests are spaced out.
    REQUEST_DELAY_SECONDS = 0.5

    def __init__(self):
        # Headers sent with every article-page request.
        self.headers = {
            'User-Agent': USER_AGENT
        }

    def get_stock_news(self, symbol: str, max_articles: int = 10) -> List[Dict]:
        """
        Get news articles for a specific stock symbol

        Args:
            symbol: Stock ticker symbol (e.g., 'AAPL')
            max_articles: Maximum number of articles to retrieve; values
                below zero are treated as zero

        Returns:
            List of dictionaries with the keys 'title', 'publisher', 'link',
            'publish_time', 'type', 'thumbnail' and 'summary'. An empty list
            is returned if anything goes wrong while fetching.
        """
        try:
            ticker = yf.Ticker(symbol)
            # The feed may be missing or empty; normalise to a list so the
            # slice below is always valid.
            news = ticker.news or []

            # Clamp so a negative limit cannot turn into "all but the last N".
            limit = max(max_articles, 0)

            articles = []
            for item in news[:limit]:
                article = {
                    'title': item.get('title', ''),
                    'publisher': item.get('publisher', 'Unknown'),
                    'link': item.get('link', ''),
                    'publish_time': item.get('providerPublishTime', 0),
                    'type': item.get('type', 'STORY'),
                    'thumbnail': self._thumbnail_url(item),
                }

                if 'summary' in item:
                    article['summary'] = item['summary']
                else:
                    article['summary'] = self._extract_summary(article['link'])
                    # Only pause when an HTTP request was actually issued.
                    if article['link']:
                        time.sleep(self.REQUEST_DELAY_SECONDS)

                articles.append(article)

            return articles

        except Exception as e:
            # Deliberate best-effort boundary: callers get an empty list
            # rather than an exception.
            print(f"Error fetching news for {symbol}: {str(e)}")
            return []

    @staticmethod
    def _thumbnail_url(item: Dict) -> str:
        """
        Return the URL of the first thumbnail resolution of a news item

        Args:
            item: Raw news item dictionary

        Returns:
            The URL string, or '' when the item has no usable thumbnail
            (missing/empty 'thumbnail', missing/empty 'resolutions', or a
            first resolution without a 'url').
        """
        thumbnail = item.get('thumbnail')
        if not isinstance(thumbnail, dict):
            return ''

        resolutions = thumbnail.get('resolutions')
        # An empty or malformed list must not raise: one bad item would
        # otherwise abort the whole news fetch.
        if not isinstance(resolutions, (list, tuple)) or not resolutions:
            return ''

        first = resolutions[0]
        if not isinstance(first, dict):
            return ''
        return first.get('url', '')

    def _extract_summary(self, url: str) -> str:
        """
        Extract article summary from URL

        Tries, in order: the ``description`` meta tag, the
        ``og:description`` meta tag, then the first paragraph longer than
        50 characters (truncated to 300 characters).

        Args:
            url: Article URL

        Returns:
            Article summary text, "No summary available" when the page has
            no usable text, or "Could not extract summary" when the URL is
            empty or the page cannot be fetched or parsed.
        """
        # Nothing to fetch; avoid issuing a request that is bound to fail.
        if not url:
            return "Could not extract summary"

        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            # Do not mistake an HTTP error page for the article itself.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            for attrs in ({'name': 'description'}, {'property': 'og:description'}):
                tag = soup.find('meta', attrs=attrs)
                if tag and tag.get('content'):
                    return tag['content']

            # Fall back to the first paragraph that looks like body text.
            for p in soup.find_all('p'):
                text = p.get_text().strip()
                if len(text) > 50:
                    return text[:300]

            return "No summary available"

        except Exception as e:
            # Best-effort: a failed summary must not fail the whole article.
            print(f"Error extracting summary: {str(e)}")
            return "Could not extract summary"

    def get_market_news(self, max_articles: int = 10) -> List[Dict]:
        """
        Get general market news

        This does not scrape the Yahoo Finance homepage; it returns the
        news feed of the '^GSPC' symbol (Yahoo's S&P 500 index ticker) as a
        stand-in for general market news.

        Args:
            max_articles: Maximum number of articles to retrieve

        Returns:
            List of dictionaries containing article information, in the
            same shape as returned by get_stock_news
        """
        return self.get_stock_news("^GSPC", max_articles)