enhance: add article content extraction and integrate with summarization process

2025-08-01 18:55:55 +02:00
parent 003b8da4b2
commit 3a1c817381
3 changed files with 131 additions and 22 deletions
--- a/backend/.gitignore
+++ b/backend/.gitignore
@@ -54,3 +54,5 @@ logs/
 .vscode/
 *.swp
 *.swo
+/owlynews.sqlite-shm
+/owlynews.sqlite-wal
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -1,3 +1,4 @@
+
 """
 Owly News Summariser Backend

@@ -12,6 +13,7 @@ import asyncio
 import json
 import os
 import sqlite3
+import re
 from contextlib import contextmanager
 from datetime import datetime, timezone, timedelta
 from http.client import HTTPException
@@ -25,6 +27,7 @@ from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from fastapi import FastAPI, Response, status, Depends
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
+from bs4 import BeautifulSoup

 # Constants
 DB_PATH = Path("owlynews.sqlite")
@@ -36,6 +39,8 @@ SYNC_COOLDOWN_MINUTES = 30
 LLM_MODEL = "qwen2:7b-instruct-q4_K_M"
 LLM_TIMEOUT_SECONDS = 180
 OLLAMA_API_TIMEOUT_SECONDS = 10
+ARTICLE_FETCH_TIMEOUT = 30
+MAX_ARTICLE_LENGTH = 5000  # Max characters from article content

 # Add logging configuration at the top of your file
 logging.basicConfig(
@@ -279,7 +284,97 @@ class NewsFetcher:
    """

    @staticmethod
-    def build_prompt(url: str, title: str = "", description: str = "") -> str:
+    async def fetch_article_content(client: httpx.AsyncClient, url: str) -> str:
+        """
+        Fetch an article page and extract its main text content.
+
+        Args:
+            client: An active httpx AsyncClient for making requests
+            url: URL of the article to fetch
+
+        Returns:
+            Whitespace-normalized article text, truncated to MAX_ARTICLE_LENGTH
+            characters plus "..." when longer; empty string if fetching or parsing failed
+        """
+        try:
+            logger.debug(f"🌐 Fetching article content from: {url}")
+
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            }
+
+            response = await client.get(
+                url,
+                headers=headers,
+                timeout=ARTICLE_FETCH_TIMEOUT,
+                follow_redirects=True
+            )
+
+            response.raise_for_status()
+
+            # Parse HTML content
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Remove unwanted elements
+            for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']):
+                element.decompose()
+
+            # Try to find main content areas
+            content_selectors = [
+                'article',
+                '[role="main"]',
+                '.content',
+                '.article-content',
+                '.post-content',
+                '.entry-content',
+                '.main-content',
+                'main',
+                '.story-body',
+                '.article-body'
+            ]
+
+            article_text = ""
+
+            # Try each selector in order until one actually yields text
+            for selector in content_selectors:
+                # Keep the longest text among this selector's matches
+                for element in soup.select(selector):
+                    text = element.get_text(separator=' ', strip=True)
+                    if len(text) > len(article_text):
+                        article_text = text
+                # Matches without text (e.g. an empty <article>) must not end the search
+                if article_text:
+                    break
+
+            # Fallback: get text from body if no specific content area found
+            if not article_text:
+                body = soup.find('body')
+                if body:
+                    article_text = body.get_text(separator=' ', strip=True)
+
+            # Clean up the text
+            article_text = re.sub(r'\s+', ' ', article_text).strip()  # Normalize whitespace
+
+            # Limit length to avoid overwhelming the LLM
+            if len(article_text) > MAX_ARTICLE_LENGTH:
+                article_text = article_text[:MAX_ARTICLE_LENGTH] + "..."
+                logger.debug(f"✂️ Truncated article content to {MAX_ARTICLE_LENGTH} characters")
+
+            logger.debug(f"📄 Extracted {len(article_text)} characters from article")
+            return article_text
+
+        except httpx.TimeoutException:
+            logger.warning(f"⏰ Timeout fetching article content from: {url}")
+            return ""
+        except httpx.HTTPError as e:
+            logger.warning(f"🌐 HTTP error fetching article content from {url}: {e}")
+            return ""
+        except Exception as e:
+            logger.warning(f"❌ Error fetching article content from {url}: {type(e).__name__}: {e}")
+            return ""
+
+    @staticmethod
+    def build_prompt(url: str, title: str = "", description: str = "", content: str = "") -> str:
        """
        Generate a prompt for the LLM to summarize an article.

@@ -287,6 +382,7 @@ class NewsFetcher:
            url: Public URL of the article to summarize
            title: Article title from RSS feed (optional)
            description: Article description from RSS feed (optional)
+            content: Extracted article content (optional)

        Returns:
            A formatted prompt string that instructs the LLM to generate
@@ -294,9 +390,13 @@ class NewsFetcher:
        """
        context_info = []
        if title:
-            context_info.append(f"Titel: {title}")
+            context_info.append(f"RSS-Titel: {title}")
        if description:
-            context_info.append(f"Beschreibung: {description}")
+            context_info.append(f"RSS-Beschreibung: {description}")
+        if content:
+            # Show first part of content for context
+            content_preview = content[:500] + "..." if len(content) > 500 else content
+            context_info.append(f"Artikel-Inhalt: {content_preview}")

        context = "\n".join(context_info) if context_info else "Keine zusätzlichen Informationen verfügbar."

@@ -306,14 +406,15 @@ class NewsFetcher:
            f"URL: {url}\n"
            f"Verfügbare Informationen:\n{context}\n\n"
            "### Regeln\n"
-            "1. Nutze die verfügbaren Informationen (Titel, Beschreibung) und dein Wissen über die URL-Domain\n"
-            "2. Falls keine ausreichenden Informationen vorliegen, erstelle eine plausible Zusammenfassung basierend auf der URL\n"
-            "3. Gib ausschließlich **gültiges minifiziertes JSON** zurück – kein Markdown, keine Kommentare\n"
-            "4. Struktur: {\"title\":\"…\",\"summary_de\":\"…\",\"summary_en\":\"…\"}\n"
-            "5. title: Aussagekräftiger deutscher Titel (max 100 Zeichen)\n"
-            "6. summary_de: Deutsche Zusammenfassung (max 160 Wörter)\n"
-            "7. summary_en: Englische Zusammenfassung (max 160 Wörter)\n"
-            "8. Kein Text vor oder nach dem JSON\n\n"
+            "1. Nutze VORRANGIG den Artikel-Inhalt falls verfügbar, ergänze mit RSS-Informationen\n"
+            "2. Falls kein Artikel-Inhalt verfügbar ist, nutze RSS-Titel und -Beschreibung\n"
+            "3. Falls keine ausreichenden Informationen vorliegen, erstelle eine plausible Zusammenfassung basierend auf der URL\n"
+            "4. Gib ausschließlich **gültiges minifiziertes JSON** zurück – kein Markdown, keine Kommentare\n"
+            "5. Struktur: {\"title\":\"…\",\"summary_de\":\"…\",\"summary_en\":\"…\"}\n"
+            "6. title: Aussagekräftiger deutscher Titel (max 100 Zeichen)\n"
+            "7. summary_de: Deutsche Zusammenfassung (max 160 Wörter)\n"
+            "8. summary_en: Englische Zusammenfassung (max 160 Wörter)\n"
+            "9. Kein Text vor oder nach dem JSON\n\n"
            "### Ausgabe\n"
            "Jetzt antworte mit dem JSON:"
        )
@@ -327,6 +428,7 @@ class NewsFetcher:
    ) -> Optional[ArticleSummary]:
        """
        Generate a summary of an article using the LLM.
+        Fetches the article content first and feeds it into the prompt; falls back to RSS data.

        Args:
            client: An active httpx AsyncClient for making requests
@@ -342,7 +444,16 @@ class NewsFetcher:
        logger.debug(f"📝 RSS Title: {title[:50]}..." if title else "📝 No RSS title")
        logger.debug(f"📄 RSS Description: {description[:100]}..." if description else "📄 No RSS description")

-        prompt = NewsFetcher.build_prompt(url, title, description)
+        # Fetch article content
+        logger.debug(f"🌐 Fetching article content...")
+        article_content = await NewsFetcher.fetch_article_content(client, url)
+
+        if article_content:
+            logger.info(f"✅ Successfully fetched article content ({len(article_content)} chars)")
+        else:
+            logger.warning(f"⚠️ Could not fetch article content, using RSS data only")
+
+        prompt = NewsFetcher.build_prompt(url, title, description, article_content)
        payload = {
            "model": LLM_MODEL,
            "prompt": prompt,
@@ -472,6 +583,7 @@ class NewsFetcher:
    ) -> Dict[str, int]:
        """
        Process a single feed, fetching and summarizing all articles.
+        Each summary is written to the database as soon as it has been generated.

        Args:
            client: An active httpx AsyncClient for making requests
@@ -509,7 +621,7 @@ class NewsFetcher:
                    continue

                if not hasattr(entry, "published_parsed"):
-                    logger.debug(f"⏩ Skipping entry {i}: no published date")  # TODO: change back to 0.5
+                    logger.debug(f"⏩ Skipping entry {i}: no published date")
                    stats['skipped'] += 1
                    continue

@@ -557,15 +669,9 @@ class NewsFetcher:

                published_timestamp = int(published.timestamp())

-                # Handle source field - it can be a string or dict
-                source_value = entry.get("source", feed_row["url"])
-                if isinstance(source_value, dict):
-                    source_title = source_value.get("title", feed_row["url"])
-                else:
-                    source_title = source_value if source_value else feed_row["url"]
-
-                logger.debug(f"💾 Storing article in database")
+                logger.debug(f"💾 Storing article in database immediately after summarization")

+                # Store in database immediately after successful summarization
                # Store in database
                try:
                    with db_manager.get_cursor() as cursor:
@@ -583,7 +689,7 @@ class NewsFetcher:
                            )
                        )

-                    logger.info(f"✅ Successfully processed article {i}: {summary['title'][:50]}...")
+                    logger.info(f"✅ Successfully processed and stored article {i}: {summary['title'][:50]}...")
                    stats['successful'] += 1

                except Exception as db_error:
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -8,3 +8,4 @@ uvicorn[standard]
 python-multipart
 psycopg2-binary
 sqlalchemy
+beautifulsoup4