refactor: modularize database management, models, and services for better structure and maintainability

2025-08-01 22:19:22 +02:00
parent e22f3a627a
commit 0fd2c7a8b6
6 changed files with 810 additions and 777 deletions
--- a/backend/app/services.py
+++ b/backend/app/services.py
@@ -0,0 +1,381 @@
+import asyncio
+import json
+import re
+import sqlite3
+from datetime import datetime, timezone
+from typing import Optional, cast, Dict
+
+import feedparser
+import httpx
+from bs4 import BeautifulSoup
+
+from backend.app.config import ARTICLE_FETCH_TIMEOUT, MAX_ARTICLE_LENGTH, logger, LLM_MODEL, OLLAMA_HOST, \
+    LLM_TIMEOUT_SECONDS
+from backend.app.database import db_manager
+from backend.app.models import ArticleSummary
+
+
+class NewsFetcher:
+    """
+    Handles fetching and summarizing news articles from RSS feeds.
+    Uses Ollama/qwen to generate summaries of articles.
+    """
+
+    @staticmethod
+    async def fetch_article_content(client: httpx.AsyncClient, url: str) -> str:
+        """
+        Download an article page and pull out its main readable text.
+
+        Boilerplate tags (scripts, styles, navigation, forms, ...) are removed,
+        then common content selectors are tried in order. The first selector
+        that matches anything supplies the text (the longest match wins); if
+        that yields no text, the text of the whole <body> is used instead.
+
+        Args:
+            client: An active httpx AsyncClient for making requests
+            url: URL of the article to fetch
+
+        Returns:
+            Whitespace-normalized article text, cut to MAX_ARTICLE_LENGTH
+            characters plus "..." when longer, or empty string if failed
+        """
+        browser_headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+                          'AppleWebKit/537.36 (KHTML, like Gecko) '
+                          'Chrome/91.0.4472.124 Safari/537.36'
+        }
+        noise_tags = ['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']
+        # Tried in order; the first selector that matches anything is used.
+        candidate_selectors = (
+            'article', '[role="main"]',
+            '.content', '.article-content',
+            '.post-content', '.entry-content',
+            '.main-content', 'main',
+            '.story-body', '.article-body',
+        )
+
+        try:
+            response = await client.get(
+                url,
+                headers=browser_headers,
+                timeout=ARTICLE_FETCH_TIMEOUT,
+                follow_redirects=True
+            )
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.text, 'html.parser')
+            for tag in soup(noise_tags):
+                tag.decompose()
+
+            article_text = ""
+            for selector in candidate_selectors:
+                matches = soup.select(selector)
+                if not matches:
+                    continue
+                # Keep the longest text among this selector's matches.
+                article_text = max(
+                    (node.get_text(separator=' ', strip=True) for node in matches),
+                    key=len
+                )
+                break
+
+            # Fallback: get text from body if no specific content area found
+            if not article_text:
+                body = soup.find('body')
+                if body:
+                    article_text = body.get_text(separator=' ', strip=True)
+
+            # Collapse whitespace runs into single spaces.
+            article_text = re.sub(r'\s+', ' ', article_text).strip()
+
+            # Limit length to avoid overwhelming the LLM
+            if len(article_text) > MAX_ARTICLE_LENGTH:
+                return article_text[:MAX_ARTICLE_LENGTH] + "..."
+            return article_text
+
+        except httpx.TimeoutException:
+            logger.warning(f"⏰ Timeout fetching article content from: {url}")
+            return ""
+        except httpx.HTTPError as e:
+            logger.warning(f"🌐 HTTP error fetching article content from {url}: {e}")
+            return ""
+        except Exception as e:
+            logger.warning(f"❌ Error fetching article content from {url}: {type(e).__name__}: {e}")
+            return ""
+
+    @staticmethod
+    def build_prompt(url: str, title: str = "", description: str = "", content: str = "",
+                     max_content_chars: int = 500) -> str:
+        """
+        Build the German-language prompt that asks the LLM to summarize an article.
+
+        Args:
+            url: Public URL of the article to summarize
+            title: Article title from RSS feed (optional)
+            description: Article description from RSS feed (optional)
+            content: Extracted article content (optional)
+            max_content_chars: Content longer than this is cut and suffixed with "..."
+
+        Returns:
+            A prompt requesting minified JSON {"title": ..., "description": ...} in German
+        """
+        context_info = []
+        if title:
+            context_info.append(f"RSS-Titel: {title}")
+        if description:
+            context_info.append(f"RSS-Beschreibung: {description}")
+        if content:
+            preview = content if len(content) <= max_content_chars else content[:max_content_chars] + "..."
+            context_info.append(f"Artikel-Inhalt: {preview}")
+        context = "\n".join(context_info) or "Keine zusätzlichen Informationen verfügbar."
+
+        return (
+            "### Aufgabe\n"
+            "Du sollst eine Nachricht basierend auf der URL und den verfügbaren Informationen zusammenfassen.\n"
+            f"URL: {url}\n"
+            f"Verfügbare Informationen:\n{context}\n\n"
+            "### Regeln\n"
+            "1. Nutze VORRANGIG den Artikel-Inhalt falls verfügbar, ergänze mit RSS-Informationen\n"
+            "2. Falls kein Artikel-Inhalt verfügbar ist, nutze RSS-Titel und -Beschreibung\n"
+            "3. Falls keine ausreichenden Informationen vorliegen, erstelle eine plausible Zusammenfassung basierend auf der URL\n"
+            "4. Gib ausschließlich **gültiges minifiziertes JSON** zurück – kein Markdown, keine Kommentare\n"
+            "5. Struktur: {\"title\":\"…\",\"description\":\"…\"}\n"
+            "6. title: Aussagekräftiger deutscher Titel (max 100 Zeichen)\n"
+            "7. description: Deutsche Zusammenfassung (zwischen 100 und 160 Wörter)\n"
+            "8. Kein Text vor oder nach dem JSON\n\n"
+            "### Ausgabe\n"
+            "Jetzt antworte mit dem JSON:"
+        )
+
+    @staticmethod
+    async def summarize_article(
+            client: httpx.AsyncClient,
+            url: str,
+            title: str = "",
+            description: str = ""
+    ) -> Optional[ArticleSummary]:
+        """
+        Generate a German summary of an article using the LLM behind OLLAMA_HOST.
+
+        The article page is fetched first; if no content could be extracted,
+        the prompt is built from the RSS title and description only.
+
+        Args:
+            client: An active httpx AsyncClient for making requests
+            url: URL of the article to summarize
+            title: Article title from RSS feed
+            description: Article description from RSS feed
+
+        Returns:
+            The parsed LLM answer containing at least "title" and "description",
+            or None if the request failed or the answer was unusable
+        """
+        article_content = await NewsFetcher.fetch_article_content(client, url)
+
+        if not article_content:
+            logger.warning("⚠️ Could not fetch article content, using RSS data only")
+
+        payload = {
+            "model": LLM_MODEL,
+            "prompt": NewsFetcher.build_prompt(url, title, description, article_content),
+            "stream": False,
+            "format": "json",
+            # Ollama takes sampling parameters inside "options"; a top-level
+            # "temperature" key is not a field of the /api/generate request.
+            "options": {"temperature": 0.1}
+        }
+        # Bound up front so the JSON error handler can always reference it.
+        llm_response = None
+
+        try:
+            response = await client.post(
+                f"{OLLAMA_HOST}/api/generate",
+                json=payload,
+                timeout=LLM_TIMEOUT_SECONDS
+            )
+            response.raise_for_status()
+            llm_response = response.json()["response"]
+
+            # "response" usually carries the JSON document as a string.
+            summary_data = json.loads(llm_response) if isinstance(llm_response, str) else llm_response
+
+            if not isinstance(summary_data, dict):
+                logger.warning(f"⚠️  Summary is not a JSON object: {type(summary_data).__name__}")
+                return None
+
+            # Validate required fields
+            missing_fields = [field for field in ("title", "description") if field not in summary_data]
+            if missing_fields:
+                logger.warning(f"⚠️  Missing required fields in summary: {missing_fields}")
+                return None
+
+            # Quality check only: an out-of-range summary is logged but still returned.
+            word_count = len(summary_data["description"].split())
+            if word_count > 160 or word_count < 100:
+                logger.warning(
+                    f"⚠️  Summary outside word range - "
+                    f"Description: {word_count} words (expected 100-160)"
+                )
+
+            return cast(ArticleSummary, summary_data)
+
+        except json.JSONDecodeError as e:
+            logger.error(f"❌ JSON parsing error for {url}: {e}")
+            logger.error(
+                f"🔍 Raw response that failed to parse: {str(llm_response)[:500]}..."
+            )
+            return None
+        except httpx.HTTPError as e:
+            logger.error(f"❌ HTTP error for {url}: {e}")
+            return None
+        except Exception as e:
+            logger.error(f"❌ Unexpected error summarizing {url}: {type(e).__name__}: {e}")
+            return None
+
+    @staticmethod
+    async def harvest_feeds() -> None:
+        """
+        Fetch articles from all feeds and store summaries in the database.
+
+        Processes every row of the feeds table in sequence, logs the summed
+        per-feed statistics and stores the current Unix time as meta key
+        'last_sync'. Any exception raised here is logged and re-raised.
+        """
+        totals = {'total': 0, 'successful': 0, 'failed': 0}
+
+        try:
+            with db_manager.get_cursor() as cursor:
+                cursor.execute("SELECT country, url FROM feeds")
+                feeds = cursor.fetchall()
+
+            async with httpx.AsyncClient() as client:
+                for feed_row in feeds:
+                    feed_stats = await NewsFetcher._process_feed(client, feed_row)
+                    for key in totals:
+                        totals[key] += feed_stats[key]
+
+            logger.info(f"📊 Harvest finished: {totals}")
+
+            current_time = int(datetime.now(timezone.utc).timestamp())
+            with db_manager.get_cursor() as cursor:
+                cursor.execute(
+                    "UPDATE meta SET val=? WHERE key='last_sync'",
+                    (str(current_time),)
+                )
+
+        except Exception as e:
+            logger.error(f"❌ Critical error during harvest: {type(e).__name__}: {e}")
+            raise
+
+    @staticmethod
+    async def _process_feed(
+            client: httpx.AsyncClient,
+            feed_row: sqlite3.Row
+    ) -> Dict[str, int]:
+        """
+        Process a single feed, fetching and summarizing all new articles.
+
+        Each summary is written to the news table as soon as it is available.
+        Entries without a link or usable publication date, and entries whose
+        URL is already stored, are counted as skipped.
+
+        Args:
+            client: An active httpx AsyncClient for making requests
+            feed_row: A database row providing the feed's "url" and "country"
+
+        Returns:
+            Dictionary with the counters 'total', 'successful', 'failed', 'skipped'
+        """
+        stats = {'total': 0, 'successful': 0, 'failed': 0, 'skipped': 0}
+
+        try:
+            # feedparser.parse() is a blocking call; run it in the default executor
+            # so the event loop stays responsive while the feed is fetched.
+            loop = asyncio.get_running_loop()
+            feed_data = await loop.run_in_executor(None, feedparser.parse, feed_row["url"])
+
+            if hasattr(feed_data, 'bozo') and feed_data.bozo:
+                logger.warning(f"⚠️  Feed has parsing issues: {feed_row['url']}")
+                if hasattr(feed_data, 'bozo_exception'):
+                    logger.warning(f"⚠️  Feed exception: {feed_data.bozo_exception}")
+
+            if not feed_data.entries:
+                logger.warning(f"⚠️  No entries found in feed: {feed_row['url']}")
+                return stats
+
+            for i, entry in enumerate(feed_data.entries, 1):
+                stats['total'] += 1
+
+                # Entries lacking a link or a publication date cannot be stored.
+                if not hasattr(entry, "link") or not hasattr(entry, "published_parsed"):
+                    stats['skipped'] += 1
+                    continue
+
+                article_url = entry.link
+
+                try:
+                    published = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc)
+                except (TypeError, ValueError):
+                    # e.g. published_parsed is None or does not form a valid date
+                    stats['skipped'] += 1
+                    continue
+
+                # Check if article already exists - use readonly connection for better concurrency
+                try:
+                    with db_manager.get_cursor_with_retry(readonly=True) as cursor:
+                        cursor.execute("SELECT id FROM news WHERE url = ?", (article_url,))
+                        if cursor.fetchone():
+                            stats['skipped'] += 1
+                            continue
+                except Exception as db_error:
+                    # Best effort: if the lookup fails the article is still processed.
+                    logger.warning(f"⚠️ Database check failed for article {i}, continuing: {db_error}")
+
+                rss_title = getattr(entry, 'title', '')
+                rss_description = getattr(entry, 'description', '') or getattr(entry, 'summary', '')
+
+                summary = await NewsFetcher.summarize_article(
+                    client,
+                    article_url,
+                    title=rss_title,
+                    description=rss_description
+                )
+
+                if not summary:
+                    logger.warning(f"❌ Failed to get summary for article {i}: {article_url}")
+                    stats['failed'] += 1
+                    continue
+
+                published_timestamp = int(published.timestamp())
+
+                try:
+                    with db_manager.get_cursor_with_retry(readonly=False) as cursor:
+                        cursor.execute(
+                            """
+                            INSERT
+                            OR IGNORE INTO news 
+                            (title, description, url, published, country)
+                            VALUES (?, ?, ?, ?, ?)
+                            """,
+                            (
+                                summary["title"],
+                                summary["description"],
+                                article_url,
+                                published_timestamp,
+                                feed_row["country"],
+                            )
+                        )
+
+                    stats['successful'] += 1
+
+                except Exception as db_error:
+                    logger.error(f"❌ Database error for article {i}: {db_error}")
+                    stats['failed'] += 1
+                    continue
+
+                await asyncio.sleep(0.01)  # 10ms delay to yield control
+
+        except Exception as e:
+            logger.error(f"❌ Error processing feed {feed_row['url']}: {type(e).__name__}: {e}")
+
+        return stats