diff --git a/backend/.gitignore b/backend/.gitignore
index a0bbdf0..364a7af 100644
--- a/backend/.gitignore
+++ b/backend/.gitignore
@@ -54,3 +54,5 @@ logs/
 .vscode/
 *.swp
 *.swo
+/owlynews.sqlite-shm
+/owlynews.sqlite-wal
diff --git a/backend/app/main.py b/backend/app/main.py
index 762289e..d00f890 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -1,3 +1,4 @@
+
 """
 Owly News Summariser Backend
 
@@ -12,6 +13,7 @@ import asyncio
 import json
 import os
 import sqlite3
+import re
 from contextlib import contextmanager
 from datetime import datetime, timezone, timedelta
 from http.client import HTTPException
@@ -25,6 +27,7 @@ from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from fastapi import FastAPI, Response, status, Depends
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
+from bs4 import BeautifulSoup
 
 # Constants
 DB_PATH = Path("owlynews.sqlite")
@@ -36,6 +39,8 @@ SYNC_COOLDOWN_MINUTES = 30
 LLM_MODEL = "qwen2:7b-instruct-q4_K_M"
 LLM_TIMEOUT_SECONDS = 180
 OLLAMA_API_TIMEOUT_SECONDS = 10
+ARTICLE_FETCH_TIMEOUT = 30
+MAX_ARTICLE_LENGTH = 5000  # Max characters from article content
 
 # Add logging configuration at the top of your file
 logging.basicConfig(
@@ -279,7 +284,97 @@ class NewsFetcher:
     """
 
     @staticmethod
-    def build_prompt(url: str, title: str = "", description: str = "") -> str:
+    async def fetch_article_content(client: httpx.AsyncClient, url: str) -> str:
+        """
+        Fetch and extract the main content from an article URL.
+
+        Args:
+            client: An active httpx AsyncClient for making requests
+            url: URL of the article to fetch
+
+        Returns:
+            Extracted text content from the article, or empty string if failed
+        """
+        try:
+            logger.debug(f"🌐 Fetching article content from: {url}")
+
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            }
+
+            response = await client.get(
+                url,
+                headers=headers,
+                timeout=ARTICLE_FETCH_TIMEOUT,
+                follow_redirects=True
+            )
+
+            response.raise_for_status()
+
+            # Parse HTML content
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Remove unwanted elements
+            for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']):
+                element.decompose()
+
+            # Try to find main content areas
+            content_selectors = [
+                'article',
+                '[role="main"]',
+                '.content',
+                '.article-content',
+                '.post-content',
+                '.entry-content',
+                '.main-content',
+                'main',
+                '.story-body',
+                '.article-body'
+            ]
+
+            article_text = ""
+
+            # Try each selector until we find content
+            for selector in content_selectors:
+                elements = soup.select(selector)
+                if elements:
+                    # Get text from all matching elements
+                    for element in elements:
+                        text = element.get_text(separator=' ', strip=True)
+                        if len(text) > len(article_text):
+                            article_text = text
+                    break
+
+            # Fallback: get text from body if no specific content area found
+            if not article_text:
+                body = soup.find('body')
+                if body:
+                    article_text = body.get_text(separator=' ', strip=True)
+
+            # Clean up the text
+            article_text = re.sub(r'\s+', ' ', article_text)  # Normalize whitespace
+            article_text = article_text.strip()
+
+            # Limit length to avoid overwhelming the LLM
+            if len(article_text) > MAX_ARTICLE_LENGTH:
+                article_text = article_text[:MAX_ARTICLE_LENGTH] + "..."
+                logger.debug(f"✂️ Truncated article content to {MAX_ARTICLE_LENGTH} characters")
+
+            logger.debug(f"📄 Extracted {len(article_text)} characters from article")
+            return article_text
+
+        except httpx.TimeoutException:
+            logger.warning(f"⏰ Timeout fetching article content from: {url}")
+            return ""
+        except httpx.HTTPError as e:
+            logger.warning(f"🌐 HTTP error fetching article content from {url}: {e}")
+            return ""
+        except Exception as e:
+            logger.warning(f"❌ Error fetching article content from {url}: {type(e).__name__}: {e}")
+            return ""
+
+    @staticmethod
+    def build_prompt(url: str, title: str = "", description: str = "", content: str = "") -> str:
         """
         Generate a prompt for the LLM to summarize an article.
 
@@ -287,6 +382,7 @@ class NewsFetcher:
             url: Public URL of the article to summarize
             title: Article title from RSS feed (optional)
             description: Article description from RSS feed (optional)
+            content: Extracted article content (optional)
 
         Returns:
             A formatted prompt string that instructs the LLM to generate
@@ -294,9 +390,13 @@ class NewsFetcher:
         """
         context_info = []
         if title:
-            context_info.append(f"Titel: {title}")
+            context_info.append(f"RSS-Titel: {title}")
         if description:
-            context_info.append(f"Beschreibung: {description}")
+            context_info.append(f"RSS-Beschreibung: {description}")
+        if content:
+            # Show first part of content for context
+            content_preview = content[:500] + "..." if len(content) > 500 else content
+            context_info.append(f"Artikel-Inhalt: {content_preview}")
 
         context = "\n".join(context_info) if context_info else "Keine zusätzlichen Informationen verfügbar."
 
@@ -306,14 +406,15 @@ class NewsFetcher:
             f"URL: {url}\n"
             f"Verfügbare Informationen:\n{context}\n\n"
             "### Regeln\n"
-            "1. Nutze die verfügbaren Informationen (Titel, Beschreibung) und dein Wissen über die URL-Domain\n"
-            "2. Falls keine ausreichenden Informationen vorliegen, erstelle eine plausible Zusammenfassung basierend auf der URL\n"
-            "3. Gib ausschließlich **gültiges minifiziertes JSON** zurück – kein Markdown, keine Kommentare\n"
-            "4. Struktur: {\"title\":\"…\",\"summary_de\":\"…\",\"summary_en\":\"…\"}\n"
-            "5. title: Aussagekräftiger deutscher Titel (max 100 Zeichen)\n"
-            "6. summary_de: Deutsche Zusammenfassung (max 160 Wörter)\n"
-            "7. summary_en: Englische Zusammenfassung (max 160 Wörter)\n"
-            "8. Kein Text vor oder nach dem JSON\n\n"
+            "1. Nutze VORRANGIG den Artikel-Inhalt falls verfügbar, ergänze mit RSS-Informationen\n"
+            "2. Falls kein Artikel-Inhalt verfügbar ist, nutze RSS-Titel und -Beschreibung\n"
+            "3. Falls keine ausreichenden Informationen vorliegen, erstelle eine plausible Zusammenfassung basierend auf der URL\n"
+            "4. Gib ausschließlich **gültiges minifiziertes JSON** zurück – kein Markdown, keine Kommentare\n"
+            "5. Struktur: {\"title\":\"…\",\"summary_de\":\"…\",\"summary_en\":\"…\"}\n"
+            "6. title: Aussagekräftiger deutscher Titel (max 100 Zeichen)\n"
+            "7. summary_de: Deutsche Zusammenfassung (max 160 Wörter)\n"
+            "8. summary_en: Englische Zusammenfassung (max 160 Wörter)\n"
+            "9. Kein Text vor oder nach dem JSON\n\n"
             "### Ausgabe\n"
             "Jetzt antworte mit dem JSON:"
         )
@@ -327,6 +428,7 @@ class NewsFetcher:
     ) -> Optional[ArticleSummary]:
         """
         Generate a summary of an article using the LLM.
+        Now fetches the actual article content for more accurate summaries.
 
         Args:
            client: An active httpx AsyncClient for making requests
@@ -342,7 +444,16 @@ class NewsFetcher:
         logger.debug(f"📝 RSS Title: {title[:50]}..." if title else "📝 No RSS title")
         logger.debug(f"📄 RSS Description: {description[:100]}..."
                      if description else "📄 No RSS description")
-        prompt = NewsFetcher.build_prompt(url, title, description)
+        # Fetch article content
+        logger.debug(f"🌐 Fetching article content...")
+        article_content = await NewsFetcher.fetch_article_content(client, url)
+
+        if article_content:
+            logger.info(f"✅ Successfully fetched article content ({len(article_content)} chars)")
+        else:
+            logger.warning(f"⚠️ Could not fetch article content, using RSS data only")
+
+        prompt = NewsFetcher.build_prompt(url, title, description, article_content)
         payload = {
             "model": LLM_MODEL,
             "prompt": prompt,
@@ -472,6 +583,7 @@ class NewsFetcher:
     ) -> Dict[str, int]:
         """
         Process a single feed, fetching and summarizing all articles.
+        Now saves summaries immediately to the database.
 
         Args:
             client: An active httpx AsyncClient for making requests
@@ -509,7 +621,7 @@ class NewsFetcher:
                 continue
 
             if not hasattr(entry, "published_parsed"):
-                logger.debug(f"⏩ Skipping entry {i}: no published date")  # TODO: change back to 0.5
+                logger.debug(f"⏩ Skipping entry {i}: no published date")
                 stats['skipped'] += 1
                 continue
 
@@ -557,15 +669,9 @@ class NewsFetcher:
 
             published_timestamp = int(published.timestamp())
 
-            # Handle source field - it can be a string or dict
-            source_value = entry.get("source", feed_row["url"])
-            if isinstance(source_value, dict):
-                source_title = source_value.get("title", feed_row["url"])
-            else:
-                source_title = source_value if source_value else feed_row["url"]
-
-            logger.debug(f"💾 Storing article in database")
+            logger.debug(f"💾 Storing article in database immediately after summarization")
 
+            # Store in database immediately after successful summarization
             # Store in database
             try:
                 with db_manager.get_cursor() as cursor:
@@ -583,7 +689,7 @@ class NewsFetcher:
                         )
                     )
 
-                logger.info(f"✅ Successfully processed article {i}: {summary['title'][:50]}...")
+                logger.info(f"✅ Successfully processed and stored article {i}: {summary['title'][:50]}...")
                 stats['successful'] += 1
 
             except Exception as db_error:
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 6819464..a95f0f3 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -8,3 +8,4 @@ uvicorn[standard]
 python-multipart
 psycopg2-binary
 sqlalchemy
+beautifulsoup4