diff --git a/backend/app/main.py b/backend/app/main.py
index e690ca7..762289e 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -14,8 +14,10 @@ import os
 import sqlite3
 from contextlib import contextmanager
 from datetime import datetime, timezone, timedelta
+from fastapi import HTTPException
 from pathlib import Path
 from typing import Dict, List, Optional, Any, Union, Iterator, Tuple, TypedDict, cast
+import logging
 
 import feedparser
 import httpx
@@ -27,7 +29,7 @@ from pydantic import BaseModel
 # Constants
 DB_PATH = Path("owlynews.sqlite")
 OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
-MIN_CRON_HOURS = 0.5
+MIN_CRON_HOURS = 0.1  # TODO: change back to 0.5
 DEFAULT_CRON_HOURS = float(os.getenv("CRON_HOURS", MIN_CRON_HOURS))
 CRON_HOURS = max(MIN_CRON_HOURS, DEFAULT_CRON_HOURS)
 SYNC_COOLDOWN_MINUTES = 30
@@ -35,6 +37,13 @@ LLM_MODEL = "qwen2:7b-instruct-q4_K_M"
 LLM_TIMEOUT_SECONDS = 180
 OLLAMA_API_TIMEOUT_SECONDS = 10
 
+# Logging configuration
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
 # FastAPI app initialization
 app = FastAPI(
     title="Owly News Summariser",
@@ -46,20 +55,19 @@ app = FastAPI(
 SCHEMA_SQL = [
     """
     CREATE TABLE IF NOT EXISTS news (
-        id TEXT PRIMARY KEY,  -- e.g. URL as unique identifier
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
         title TEXT NOT NULL,
-        summary_de TEXT,
-        summary_en TEXT,
-        published INTEGER,  -- Unix epoch (UTC); use TEXT ISO-8601 if you prefer
-        source TEXT,
-        country TEXT,
-        source_feed TEXT
+        description TEXT,
+        url TEXT NOT NULL,
+        published INTEGER NOT NULL,
+        country TEXT NOT NULL,
+        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
     )
     """,
     "CREATE INDEX IF NOT EXISTS idx_news_published ON news(published)",
     """
    CREATE TABLE IF NOT EXISTS feeds (
-        id INTEGER PRIMARY KEY,  -- auto-increment via rowid
+        id INTEGER PRIMARY KEY,
         country TEXT,
         url TEXT UNIQUE NOT NULL
     )
@@ -94,23 +102,23 @@ class DatabaseManager:
             db_path: Path to the SQLite database file
         """
         self.db_path = db_path
-        self._connection = None
         self._initialize_db()
 
     def _get_connection(self) -> sqlite3.Connection:
         """
-        Get or create a database connection.
+        Create a thread-safe database connection.
 
         Returns:
             An active SQLite connection
         """
-        if self._connection is None:
-            self._connection = sqlite3.connect(
-                self.db_path,
-                check_same_thread=False
-            )
-            self._connection.row_factory = sqlite3.Row
-        return self._connection
+        conn = sqlite3.connect(
+            self.db_path,
+            check_same_thread=False,  # Allow use across threads
+            timeout=20.0  # Add timeout to prevent deadlocks
+        )
+        conn.row_factory = sqlite3.Row
+        conn.execute("PRAGMA journal_mode=WAL")
+        return conn
 
     @contextmanager
     def get_cursor(self) -> Iterator[sqlite3.Cursor]:
         """
@@ -119,70 +127,138 @@ class DatabaseManager:
         Yields:
             A database cursor for executing SQL statements
-
-        Example:
-            ```python
-            with db_manager.get_cursor() as cursor:
-                cursor.execute("SELECT * FROM table")
-                results = cursor.fetchall()
-            ```
         """
-        conn = self._get_connection()
-        cursor = conn.cursor()
+        conn = None
         try:
+            conn = self._get_connection()
+            cursor = conn.cursor()
             yield cursor
             conn.commit()
-        except Exception:
-            conn.rollback()
-            raise
+        except Exception as e:
+            if conn:
+                conn.rollback()
+            raise e
+        finally:
+            if conn:
+                conn.close()
 
     def _initialize_db(self) -> None:
         """
         Initialize the database schema and default settings.
 
         Creates tables if they don't exist and inserts default values.
""" + logger.info("🗄️ Initializing database...") + # Create schema with self.get_cursor() as cursor: - for stmt in SCHEMA_SQL: + for i, stmt in enumerate(SCHEMA_SQL): + logger.debug(f"📝 Executing schema statement {i+1}/{len(SCHEMA_SQL)}") cursor.execute(stmt) + # Add migration for description column if it doesn't exist + try: + cursor.execute("SELECT description FROM news LIMIT 1") + logger.debug("✅ Description column exists") + except sqlite3.OperationalError: + # Column doesn't exist, add it + logger.info("🔧 Adding missing description column to news table...") + cursor.execute("ALTER TABLE news ADD COLUMN description TEXT") + # Insert initial settings cursor.execute( "INSERT INTO settings VALUES (?, ?) ON CONFLICT (key) DO NOTHING", ("cron_hours", str(CRON_HOURS)) ) + logger.debug("⚙️ Settings initialized") # Insert initial metadata cursor.execute( "INSERT INTO meta VALUES (?, ?) ON CONFLICT (key) DO NOTHING", ("last_sync", "0") ) + logger.debug("📊 Metadata initialized") + + # Check current feed count + cursor.execute("SELECT COUNT(*) as count FROM feeds") + feed_count = cursor.fetchone()["count"] + logger.info(f"📡 Current feeds in database: {feed_count}") # Seed feeds if none exist - cursor.execute("SELECT COUNT(*) as count FROM feeds") - if cursor.fetchone()["count"] == 0: - self._seed_feeds() + if feed_count == 0: + logger.info("🌱 No feeds found, starting seeding process...") + feeds_added = self._seed_feeds(cursor) # Pass the existing cursor - def _seed_feeds(self) -> None: + # Verify seeding worked + cursor.execute("SELECT COUNT(*) as count FROM feeds") + new_feed_count = cursor.fetchone()["count"] + logger.info(f"📡 Feeds after seeding: {new_feed_count}") + else: + logger.info("📡 Feeds already exist, skipping seeding") + + logger.info("✅ Database initialization complete") + + def _seed_feeds(self, cursor: sqlite3.Cursor) -> int: """ Seed the database with initial feeds from the seed_feeds.json file. Only runs if the feeds table is empty. + + Args: + cursor: Database cursor to use for operations + + Returns: + Number of feeds added """ + logger.info("🌱 Seeding feeds from seed_feeds.json...") + feeds_added = 0 + try: seed_path = Path(__file__).with_name("seed_feeds.json") + logger.debug(f"📁 Looking for seed file at: {seed_path}") + + if not seed_path.exists(): + logger.error(f"❌ Seed file not found at: {seed_path}") + return feeds_added + with open(seed_path, "r") as f: seed_data = json.load(f) - with self.get_cursor() as cursor: - for country, urls in seed_data.items(): - for url in urls: + logger.debug(f"📄 Loaded seed data: {seed_data}") + + for country, urls in seed_data.items(): + logger.info(f"🌍 Processing {len(urls)} feeds for country: {country}") + for url in urls: + try: cursor.execute( "INSERT INTO feeds (country, url) VALUES (?, ?) 
" "ON CONFLICT (url) DO NOTHING", (country, url) ) - except (FileNotFoundError, json.JSONDecodeError) as e: - print(f"Error seeding feeds: {e}") + # Check if the insert actually added a row + if cursor.rowcount > 0: + feeds_added += 1 + logger.debug(f"✅ Added feed: {url} ({country})") + else: + logger.debug(f"⏩ Feed already exists: {url} ({country})") + except Exception as e: + logger.error(f"❌ Failed to add feed {url}: {e}") + + logger.info(f"🌱 Seeding complete: {feeds_added} feeds added") + + except json.JSONDecodeError as e: + logger.error(f"❌ Invalid JSON in seed_feeds.json: {e}") + # Re-read file content for error reporting + try: + with open(seed_path, "r") as f: + content = f.read() + logger.error(f"📄 File content causing error: {content}") + except: + logger.error("📄 Could not re-read file for error reporting") + except FileNotFoundError as e: + logger.error(f"❌ Seed file not found: {e}") + except Exception as e: + logger.error(f"❌ Error seeding feeds: {e}") + + return feeds_added # Initialize database manager @@ -203,41 +279,51 @@ class NewsFetcher: """ @staticmethod - def build_prompt(url: str) -> str: + def build_prompt(url: str, title: str = "", description: str = "") -> str: """ Generate a prompt for the LLM to summarize an article. Args: url: Public URL of the article to summarize + title: Article title from RSS feed (optional) + description: Article description from RSS feed (optional) Returns: A formatted prompt string that instructs the LLM to generate a JSON response with title and summaries in German and English - - Note: - LLMs like qwen2 don't have native web access; the model will - generate summaries based on its training data and the URL. """ + context_info = [] + if title: + context_info.append(f"Titel: {title}") + if description: + context_info.append(f"Beschreibung: {description}") + + context = "\n".join(context_info) if context_info else "Keine zusätzlichen Informationen verfügbar." + return ( "### Aufgabe\n" - f"Du bekommst eine öffentliche URL: {url}\n" + f"Du sollst eine Nachricht basierend auf der URL und den verfügbaren Informationen zusammenfassen.\n" + f"URL: {url}\n" + f"Verfügbare Informationen:\n{context}\n\n" "### Regeln\n" - "1. **Entnimm den Inhalt nicht automatisch.** " - "Falls dir der Text nicht vorliegt, antworte mit leeren Strings.\n" - "2. Gib ausschließlich **gültiges minifiziertes JSON** zurück – " - "kein Markdown, keine Kommentare.\n" - "3. Struktur:\n" - "{\"title\":\"…\",\"summary_de\":\"…\",\"summary_en\":\"…\"}\n" - "4. summary_de ≤ 160 Wörter, summary_en ≤ 160 Wörter. Zähle selbst.\n" - "5. Kein Text vor oder nach dem JSON.\n" + "1. Nutze die verfügbaren Informationen (Titel, Beschreibung) und dein Wissen über die URL-Domain\n" + "2. Falls keine ausreichenden Informationen vorliegen, erstelle eine plausible Zusammenfassung basierend auf der URL\n" + "3. Gib ausschließlich **gültiges minifiziertes JSON** zurück – kein Markdown, keine Kommentare\n" + "4. Struktur: {\"title\":\"…\",\"summary_de\":\"…\",\"summary_en\":\"…\"}\n" + "5. title: Aussagekräftiger deutscher Titel (max 100 Zeichen)\n" + "6. summary_de: Deutsche Zusammenfassung (max 160 Wörter)\n" + "7. summary_en: Englische Zusammenfassung (max 160 Wörter)\n" + "8. Kein Text vor oder nach dem JSON\n\n" "### Ausgabe\n" - "Jetzt antworte." 
+ "Jetzt antworte mit dem JSON:" ) @staticmethod async def summarize_article( client: httpx.AsyncClient, - url: str + url: str, + title: str = "", + description: str = "" ) -> Optional[ArticleSummary]: """ Generate a summary of an article using the LLM. @@ -245,31 +331,85 @@ class NewsFetcher: Args: client: An active httpx AsyncClient for making requests url: URL of the article to summarize + title: Article title from RSS feed + description: Article description from RSS feed Returns: A dictionary containing the article title and summaries in German and English, or None if summarization failed """ - prompt = NewsFetcher.build_prompt(url) + logger.info(f"🤖 Starting article summarization for: {url}") + logger.debug(f"📝 RSS Title: {title[:50]}..." if title else "📝 No RSS title") + logger.debug(f"📄 RSS Description: {description[:100]}..." if description else "📄 No RSS description") + + prompt = NewsFetcher.build_prompt(url, title, description) payload = { "model": LLM_MODEL, "prompt": prompt, "stream": False, - "temperature": 0.2, + "temperature": 0.3, # Slightly increase creativity "format": "json" } try: + logger.debug(f"📤 Sending request to Ollama API with model: {LLM_MODEL}") + start_time = datetime.now() + response = await client.post( f"{OLLAMA_HOST}/api/generate", json=payload, timeout=LLM_TIMEOUT_SECONDS ) + + elapsed_time = (datetime.now() - start_time).total_seconds() + logger.info(f"⏱️ Ollama API response received in {elapsed_time:.2f}s") + response.raise_for_status() result = response.json() - return cast(ArticleSummary, result["response"]) - except (KeyError, ValueError, httpx.HTTPError, json.JSONDecodeError) as e: - print(f"Error summarizing article {url}: {e}") + + logger.debug(f"📥 Raw Ollama response keys: {list(result.keys())}") + + # Parse the JSON string returned by the LLM + llm_response = result["response"] + logger.debug(f"🔍 LLM response type: {type(llm_response)}") + logger.debug(f"🔍 LLM response preview: {str(llm_response)[:200]}...") + + if isinstance(llm_response, str): + logger.debug("📋 Parsing JSON string response") + summary_data = json.loads(llm_response) + else: + logger.debug("📋 Using direct dict response") + summary_data = llm_response + + # Validate required fields + required_fields = ["title", "summary_de", "summary_en"] + missing_fields = [field for field in required_fields if field not in summary_data] + + if missing_fields: + logger.warning(f"⚠️ Missing required fields in summary: {missing_fields}") + return None + + # Log summary quality metrics + title_len = len(summary_data.get("title", "")) + de_words = len(summary_data.get("summary_de", "").split()) + en_words = len(summary_data.get("summary_en", "").split()) + + logger.info(f"✅ Summary generated - Title: {title_len} chars, DE: {de_words} words, EN: {en_words} words") + + if de_words > 160 or en_words > 160: + logger.warning(f"⚠️ Summary exceeds word limit - DE: {de_words}/160, EN: {en_words}/160") + + return cast(ArticleSummary, summary_data) + + except json.JSONDecodeError as e: + logger.error(f"❌ JSON parsing error for {url}: {e}") + logger.error(f"🔍 Raw response that failed to parse: {llm_response[:500]}...") + return None + except httpx.HTTPError as e: + logger.error(f"❌ HTTP error for {url}: {e}") + return None + except Exception as e: + logger.error(f"❌ Unexpected error summarizing {url}: {type(e).__name__}: {e}") return None @staticmethod @@ -278,16 +418,35 @@ class NewsFetcher: Fetch articles from all feeds and store summaries in the database. 
 
         This is the main function that runs periodically to update the news database.
         """
+        logger.info("🚀 Starting scheduled news harvest...")
+        harvest_start_time = datetime.now()
+
+        total_feeds = 0
+        total_articles = 0
+        successful_articles = 0
+        failed_articles = 0
+
         try:
             # Get all feeds from the database
             with db_manager.get_cursor() as cursor:
                 cursor.execute("SELECT country, url FROM feeds")
                 feeds = cursor.fetchall()
+                total_feeds = len(feeds)
+
+            logger.info(f"📡 Found {total_feeds} feeds to process")
 
             # Process each feed
             async with httpx.AsyncClient() as client:
-                for feed_row in feeds:
-                    await NewsFetcher._process_feed(client, feed_row)
+                for i, feed_row in enumerate(feeds, 1):
+                    logger.info(f"📰 Processing feed {i}/{total_feeds}: {feed_row['url']} ({feed_row['country']})")
+
+                    feed_stats = await NewsFetcher._process_feed(client, feed_row)
+
+                    total_articles += feed_stats['total']
+                    successful_articles += feed_stats['successful']
+                    failed_articles += feed_stats['failed']
+
+                    logger.info(f"📊 Feed {i} complete: {feed_stats['successful']}/{feed_stats['total']} articles processed successfully")
 
             # Update last sync timestamp
             current_time = int(datetime.now(timezone.utc).timestamp())
@@ -296,30 +455,66 @@ class NewsFetcher:
                     "UPDATE meta SET val=? WHERE key='last_sync'",
                     (str(current_time),)
                 )
+
+            harvest_duration = (datetime.now() - harvest_start_time).total_seconds()
+
+            logger.info(f"✅ News harvest completed in {harvest_duration:.2f}s")
+            logger.info(f"📊 Final stats: {total_feeds} feeds, {successful_articles}/{total_articles} articles processed successfully")
+
         except Exception as e:
-            print(f"Error harvesting feeds: {e}")
+            logger.error(f"❌ Critical error during harvest: {type(e).__name__}: {e}")
+            raise
 
     @staticmethod
     async def _process_feed(
         client: httpx.AsyncClient,
         feed_row: sqlite3.Row
-    ) -> None:
+    ) -> Dict[str, int]:
         """
         Process a single feed, fetching and summarizing all articles.
 
         Args:
             client: An active httpx AsyncClient for making requests
             feed_row: A database row containing feed information
+
+        Returns:
+            Dictionary with processing statistics
         """
+        stats = {'total': 0, 'successful': 0, 'failed': 0, 'skipped': 0}
+
         try:
+            logger.debug(f"🔍 Parsing RSS feed: {feed_row['url']}")
             feed_data = feedparser.parse(feed_row["url"])
 
-            for entry in feed_data.entries:
+            if hasattr(feed_data, 'bozo') and feed_data.bozo:
+                logger.warning(f"⚠️ Feed has parsing issues: {feed_row['url']}")
+                if hasattr(feed_data, 'bozo_exception'):
+                    logger.warning(f"⚠️ Feed exception: {feed_data.bozo_exception}")
+
+            total_entries = len(feed_data.entries)
+            logger.info(f"📄 Found {total_entries} entries in feed")
+
+            if total_entries == 0:
+                logger.warning(f"⚠️ No entries found in feed: {feed_row['url']}")
+                return stats
+
+            for i, entry in enumerate(feed_data.entries, 1):
+                stats['total'] += 1
+                logger.debug(f"📝 Processing article {i}/{total_entries}")
+
                 # Skip entries without links or published dates
-                if not hasattr(entry, "link") or not hasattr(entry, "published_parsed"):
+                if not hasattr(entry, "link"):
+                    logger.debug(f"⏩ Skipping entry {i}: no link")
+                    stats['skipped'] += 1
                     continue
 
-                article_id = entry.link
+                if not hasattr(entry, "published_parsed"):
+                    logger.debug(f"⏩ Skipping entry {i}: no published date")
+                    stats['skipped'] += 1
+                    continue
+
+                article_url = entry.link
+                logger.debug(f"🔗 Processing article: {article_url}")
 
                 # Parse the published date
                 try:
@@ -327,39 +522,80 @@ class NewsFetcher:
                         *entry.published_parsed[:6],
                         tzinfo=timezone.utc
                     )
-                except (TypeError, ValueError):
-                    # Skip entries with invalid dates
+                    logger.debug(f"📅 Article published: {published}")
+                except (TypeError, ValueError) as e:
+                    logger.debug(f"⏩ Skipping entry {i}: invalid date - {e}")
+                    stats['skipped'] += 1
                     continue
 
+                # Check if article already exists
+                with db_manager.get_cursor() as cursor:
+                    cursor.execute("SELECT id FROM news WHERE url = ?", (article_url,))
+                    if cursor.fetchone():
+                        logger.debug(f"⏩ Skipping entry {i}: article already exists")
+                        stats['skipped'] += 1
+                        continue
+
                 # Get article summary
-                summary = await NewsFetcher.summarize_article(client, entry.link)
+                logger.debug(f"🤖 Requesting summary for article {i}")
+
+                # Extract title and description from RSS entry
+                rss_title = getattr(entry, 'title', '')
+                rss_description = getattr(entry, 'description', '') or getattr(entry, 'summary', '')
+
+                summary = await NewsFetcher.summarize_article(
+                    client,
+                    article_url,
+                    title=rss_title,
+                    description=rss_description
+                )
+
                 if not summary:
+                    logger.warning(f"❌ Failed to get summary for article {i}: {article_url}")
+                    stats['failed'] += 1
                     continue
 
+                published_timestamp = int(published.timestamp())
+
+                # Handle source field - it can be a string or dict
+                source_value = entry.get("source", feed_row["url"])
+                if isinstance(source_value, dict):
+                    source_title = source_value.get("title", feed_row["url"])
+                else:
+                    source_title = source_value if source_value else feed_row["url"]
+
+                logger.debug("💾 Storing article in database")
+
                 # Store in database
-                with db_manager.get_cursor() as cursor:
-                    cursor.execute(
-                        """
-                        INSERT INTO news (
-                            id, title, summary_de, summary_en, published,
-                            source, country, source_feed
+                try:
+                    with db_manager.get_cursor() as cursor:
+                        cursor.execute(
+                            """
+                            INSERT INTO news (title, description, url, published, country)
+                            VALUES (?, ?, ?, ?, ?)
+ """, + ( + summary["title"], + summary["summary_de"], + article_url, + published_timestamp, + feed_row["country"], + ) ) - VALUES (?, ?, ?, ?, ?, ?, ?, ?) - ON CONFLICT (id) DO NOTHING - """, - ( - article_id, - summary["title"], - summary["summary_de"], - summary["summary_en"], - published.isoformat(), - entry.get("source", {}).get("title", feed_row["url"]), - feed_row["country"], - feed_row["url"], - ) - ) + + logger.info(f"✅ Successfully processed article {i}: {summary['title'][:50]}...") + stats['successful'] += 1 + + except Exception as db_error: + logger.error(f"❌ Database error for article {i}: {db_error}") + stats['failed'] += 1 + continue + except Exception as e: - print(f"Error processing feed {feed_row['url']}: {e}") + logger.error(f"❌ Error processing feed {feed_row['url']}: {type(e).__name__}: {e}") + + logger.info(f"📊 Feed processing complete: {stats['successful']} successful, {stats['failed']} failed, {stats['skipped']} skipped out of {stats['total']} total") + return stats # Initialize scheduler @@ -370,7 +606,11 @@ scheduler.add_job( hours=CRON_HOURS, id="harvest" ) +print(f"Starting scheduler with {CRON_HOURS} hours interval") scheduler.start() +print("Scheduler started") +print(f"Next run: {scheduler.get_job('harvest').next_run_time}") + # Pydantic models for API requests and responses @@ -414,7 +654,7 @@ class HoursResponse(BaseModel): # Dependency for getting a database cursor -def get_db(): +async def get_db(): """ Dependency that provides a database cursor. @@ -445,14 +685,20 @@ async def get_news( Returns: List of news articles matching the criteria """ - db.execute( - """ - SELECT * FROM news - WHERE country=? AND published BETWEEN ? AND ? - ORDER BY published DESC - """, - (country, from_, to) - ) + try: + datetime.fromisoformat(from_) + datetime.fromisoformat(to) + except ValueError: + raise HTTPException(400, "Invalid date format") + finally: + db.execute( + """ + SELECT id, title, description, url, published, country, created_at FROM news + WHERE country=? AND published BETWEEN ? AND ? + ORDER BY published DESC + """, + (country, from_, to) + ) return [dict(row) for row in db.fetchall()] @@ -622,9 +868,11 @@ async def manual_sync(db: sqlite3.Cursor = Depends(get_db)): ) # Trigger sync in background - asyncio.create_task(NewsFetcher.harvest_feeds()) - return {"status": "triggered"} - + try: + task = asyncio.create_task(NewsFetcher.harvest_feeds()) + return {"status": "triggered", "task_id": id(task)} + except Exception as e: + raise HTTPException(500, f"Failed to trigger sync: {str(e)}") # Mount static frontend frontend_path = os.path.join( diff --git a/backend/app/seed_feeds.json b/backend/app/seed_feeds.json index 99e5370..85e5ef3 100644 --- a/backend/app/seed_feeds.json +++ b/backend/app/seed_feeds.json @@ -1,7 +1,6 @@ { "DE": [ - "https://www.tagesschau.de/xml/rss2", - "https://www.spiegel.de/schlagzeilen/tops/index.rss" + "https://www.tagesschau.de/xml/rss2" ], "EU": [ "https://www.euronews.com/rss?level=theme&name=news" diff --git a/frontend/src/App.vue b/frontend/src/App.vue index 3bd7f5d..79166b2 100644 --- a/frontend/src/App.vue +++ b/frontend/src/App.vue @@ -37,7 +37,7 @@ onMounted(async () => { }}

{{ a.summary_de }}

{{ a.summary_en }}

-          Original →
+          Original →
diff --git a/frontend/src/stores/useNews.ts b/frontend/src/stores/useNews.ts
index bf2c03e..b66ef8d 100644
--- a/frontend/src/stores/useNews.ts
+++ b/frontend/src/stores/useNews.ts
@@ -4,9 +4,10 @@ import {set, get} from 'idb-keyval';
 export const useNews = defineStore('news', {
   state: () => ({
     articles: [] as {
-      id: string,
+      id: number,
       published: number,
       title: string,
+      url: string,
       source: string,
       summary_de: string,
       summary_en: string
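
For reference, a minimal usage sketch of the reworked `DatabaseManager.get_cursor()` contract from the backend change above (each call opens its own WAL-mode connection, commits on success, rolls back on error, and always closes). The `from app.main import db_manager` import path is an assumption about the package layout; the feed URL comes from `seed_feeds.json`.

```python
# Sketch only: exercises the per-call connection behaviour of get_cursor().
from app.main import db_manager  # assumed import path; adjust to your layout

# Insert runs in its own connection and is committed when the block exits.
with db_manager.get_cursor() as cursor:
    cursor.execute(
        "INSERT INTO feeds (country, url) VALUES (?, ?) "
        "ON CONFLICT (url) DO NOTHING",
        ("DE", "https://www.tagesschau.de/xml/rss2")
    )

# A second call opens a fresh connection; rows come back as sqlite3.Row
# because row_factory is set in _get_connection().
with db_manager.get_cursor() as cursor:
    cursor.execute("SELECT country, url FROM feeds")
    for row in cursor.fetchall():
        print(row["country"], row["url"])
```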