From e22f3a627a179914788e8f3c8c9df44a81264ba3 Mon Sep 17 00:00:00 2001 From: vikingowl Date: Fri, 1 Aug 2025 21:57:13 +0200 Subject: [PATCH] refactor: improve database initialization and news fetching structure --- backend/app/main.py | 783 ++++++++++-------- backend/app/schema.sql | 34 + backend/example.env | 25 +- frontend/src/App.vue | 18 +- frontend/src/components/CronSlider.vue | 2 +- frontend/src/components/NewsRefreshButton.vue | 27 + frontend/src/stores/useNews.ts | 45 +- frontend/vite.config.ts | 18 +- 8 files changed, 552 insertions(+), 400 deletions(-) create mode 100644 backend/app/schema.sql create mode 100644 frontend/src/components/NewsRefreshButton.vue diff --git a/backend/app/main.py b/backend/app/main.py index d00f890..9537ad7 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,4 +1,3 @@ - """ Owly News Summariser Backend @@ -9,88 +8,135 @@ an API for the frontend to access the summarized news. The application uses SQLite for data storage and APScheduler for scheduling periodic news harvesting. """ +# Standard library imports import asyncio import json +import logging import os -import sqlite3 import re +import sqlite3 from contextlib import contextmanager -from datetime import datetime, timezone, timedelta +from datetime import datetime, timedelta, timezone from http.client import HTTPException from pathlib import Path -from typing import Dict, List, Optional, Any, Union, Iterator, Tuple, TypedDict, cast -import logging +from typing import Any, Dict, Iterator, List, Optional, Tuple, TypedDict, Union, cast +# Third-party imports import feedparser import httpx from apscheduler.schedulers.asyncio import AsyncIOScheduler -from fastapi import FastAPI, Response, status, Depends +from apscheduler.triggers.interval import IntervalTrigger +from bs4 import BeautifulSoup +from fastapi import Depends, FastAPI, Response, status from fastapi.staticfiles import StaticFiles from pydantic import BaseModel -from bs4 import BeautifulSoup -# Constants -DB_PATH = Path("owlynews.sqlite") +DB_PATH = Path(os.getenv("DB_NAME", "owlynews.sqlite3")) OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434") -MIN_CRON_HOURS = 0.1 +MIN_CRON_HOURS = float(os.getenv("MIN_CRON_HOURS", 0.5)) DEFAULT_CRON_HOURS = float(os.getenv("CRON_HOURS", MIN_CRON_HOURS)) CRON_HOURS = max(MIN_CRON_HOURS, DEFAULT_CRON_HOURS) -SYNC_COOLDOWN_MINUTES = 30 -LLM_MODEL = "qwen2:7b-instruct-q4_K_M" -LLM_TIMEOUT_SECONDS = 180 -OLLAMA_API_TIMEOUT_SECONDS = 10 -ARTICLE_FETCH_TIMEOUT = 30 -MAX_ARTICLE_LENGTH = 5000 # Max characters from article content +SYNC_COOLDOWN_MINUTES = int(os.getenv("SYNC_COOLDOWN_MINUTES", 30)) +LLM_MODEL = os.getenv("LLM_MODEL", "qwen2:7b-instruct-q4_K_M") +LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", 180)) +OLLAMA_API_TIMEOUT_SECONDS = int(os.getenv("OLLAMA_API_TIMEOUT_SECONDS", 10)) +ARTICLE_FETCH_TIMEOUT = int(os.getenv("ARTICLE_FETCH_TIMEOUT", 30)) +MAX_ARTICLE_LENGTH = int(os.getenv("MAX_ARTICLE_LENGTH", 5000)) + +def update_constants_from_db(settings_dict): + """ + Update global constants with values from the database settings. + Environment variables take precedence over database settings. 
+ + Args: + settings_dict: Dictionary of settings from the database + """ + global OLLAMA_HOST, MIN_CRON_HOURS, CRON_HOURS, SYNC_COOLDOWN_MINUTES + global LLM_MODEL, LLM_TIMEOUT_SECONDS, OLLAMA_API_TIMEOUT_SECONDS + global ARTICLE_FETCH_TIMEOUT, MAX_ARTICLE_LENGTH + + if 'ollama_host' in settings_dict and os.getenv("OLLAMA_HOST") is None: + OLLAMA_HOST = settings_dict['ollama_host'] + + if 'min_cron_hours' in settings_dict and os.getenv("MIN_CRON_HOURS") is None: + try: + MIN_CRON_HOURS = float(settings_dict['min_cron_hours']) + except (ValueError, TypeError): + logger.warning( + f"⚠️ Invalid min_cron_hours value in DB: " + f"{settings_dict['min_cron_hours']}" + ) + + if 'cron_hours' in settings_dict and os.getenv("CRON_HOURS") is None: + try: + cron_hours_value = float(settings_dict['cron_hours']) + CRON_HOURS = max(MIN_CRON_HOURS, cron_hours_value) + except (ValueError, TypeError): + logger.warning( + f"⚠️ Invalid cron_hours value in DB: " + f"{settings_dict['cron_hours']}" + ) + + if 'sync_cooldown_minutes' in settings_dict and os.getenv("SYNC_COOLDOWN_MINUTES") is None: + try: + SYNC_COOLDOWN_MINUTES = int(settings_dict['sync_cooldown_minutes']) + except (ValueError, TypeError): + logger.warning( + f"⚠️ Invalid sync_cooldown_minutes value in DB: " + f"{settings_dict['sync_cooldown_minutes']}" + ) + + if 'llm_model' in settings_dict and os.getenv("LLM_MODEL") is None: + LLM_MODEL = settings_dict['llm_model'] + + if 'llm_timeout_seconds' in settings_dict and os.getenv("LLM_TIMEOUT_SECONDS") is None: + try: + LLM_TIMEOUT_SECONDS = int(settings_dict['llm_timeout_seconds']) + except (ValueError, TypeError): + logger.warning( + f"⚠️ Invalid llm_timeout_seconds value in DB: " + f"{settings_dict['llm_timeout_seconds']}" + ) + + if 'ollama_api_timeout_seconds' in settings_dict and os.getenv("OLLAMA_API_TIMEOUT_SECONDS") is None: + try: + OLLAMA_API_TIMEOUT_SECONDS = int(settings_dict['ollama_api_timeout_seconds']) + except (ValueError, TypeError): + logger.warning( + f"⚠️ Invalid ollama_api_timeout_seconds value in DB: " + f"{settings_dict['ollama_api_timeout_seconds']}" + ) + + if 'article_fetch_timeout' in settings_dict and os.getenv("ARTICLE_FETCH_TIMEOUT") is None: + try: + ARTICLE_FETCH_TIMEOUT = int(settings_dict['article_fetch_timeout']) + except (ValueError, TypeError): + logger.warning( + f"⚠️ Invalid article_fetch_timeout value in DB: " + f"{settings_dict['article_fetch_timeout']}" + ) + + if 'max_article_length' in settings_dict and os.getenv("MAX_ARTICLE_LENGTH") is None: + try: + MAX_ARTICLE_LENGTH = int(settings_dict['max_article_length']) + except (ValueError, TypeError): + logger.warning( + f"⚠️ Invalid max_article_length value in DB: " + f"{settings_dict['max_article_length']}" + ) -# Add logging configuration at the top of your file logging.basicConfig( - level=logging.INFO, + level=logging.WARNING, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) -# FastAPI app initialization app = FastAPI( title="Owly News Summariser", description="API for the Owly News Summariser application", version="1.0.0" ) -# Database schema definitions -SCHEMA_SQL = [ - """ - CREATE TABLE IF NOT EXISTS news ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - title TEXT NOT NULL, - description TEXT, - url TEXT NOT NULL, - published TEXT NOT NULL, - country TEXT NOT NULL, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - """, - "CREATE INDEX IF NOT EXISTS idx_news_published ON news(published)", - """ - CREATE TABLE IF NOT EXISTS feeds ( - id INTEGER 
PRIMARY KEY, - country TEXT, - url TEXT UNIQUE NOT NULL - ) - """, - """ - CREATE TABLE IF NOT EXISTS settings ( - key TEXT PRIMARY KEY, - val TEXT NOT NULL - ) - """, - """ - CREATE TABLE IF NOT EXISTS meta ( - key TEXT PRIMARY KEY, - val TEXT NOT NULL - ) - """ -] - class DatabaseManager: """ @@ -109,6 +155,116 @@ class DatabaseManager: self.db_path = db_path self._initialize_db() + def _initialize_db(self) -> None: + """ + Initialize the database by creating tables if they don't exist. + Also seeds initial feeds from seed_feeds.json and settings from global constants. + After initialization, updates global constants with values from the database. + """ + try: + schema_file = Path(__file__).parent / "schema.sql" + if not schema_file.exists(): + logger.error("❌ schema.sql not found") + raise FileNotFoundError("schema.sql not found") + + with open(schema_file, 'r', encoding='utf-8') as f: + schema_sql = f.read() + + with self.get_cursor() as cursor: + statements = [stmt.strip() for stmt in schema_sql.split(';') if stmt.strip()] + for statement in statements: + cursor.execute(statement) + + cursor.execute("SELECT COUNT(*) FROM feeds") + feed_count = cursor.fetchone()[0] + + if feed_count == 0: + self._seed_feeds(cursor) + + cursor.execute("SELECT COUNT(*) FROM settings") + settings_count = cursor.fetchone()[0] + + if settings_count == 0: + self._seed_settings(cursor) + + settings = self.get_all_settings() + update_constants_from_db(settings) + except Exception as e: + logger.error(f"❌ Failed to initialize database: {e}") + raise + + def get_all_settings(self) -> dict: + """ + Retrieve all settings from the database. + + Returns: + Dictionary of settings with key-value pairs + """ + settings = {} + try: + with self.get_cursor(readonly=True) as cursor: + cursor.execute("SELECT key, val FROM settings") + for row in cursor.fetchall(): + settings[row['key']] = row['val'] + return settings + except Exception as e: + logger.error(f"❌ Failed to retrieve settings from database: {e}") + return {} + + def _seed_feeds(self, cursor) -> None: + """ + Seed initial feeds from seed_feeds.json file. + """ + import json + from pathlib import Path + + try: + seed_file = Path(__file__).parent / "seed_feeds.json" + + if not seed_file.exists(): + logger.warning("⚠️ seed_feeds.json not found, skipping feed seeding") + return + + with open(seed_file, 'r', encoding='utf-8') as f: + feeds_data = json.load(f) + + for country, urls in feeds_data.items(): + for url in urls: + cursor.execute( + "INSERT OR IGNORE INTO feeds (country, url) VALUES (?, ?)", + (country, url) + ) + + + except Exception as e: + logger.error(f"❌ Failed to seed feeds: {e}") + + def _seed_settings(self, cursor) -> None: + """ + Seed initial settings from global constants. + """ + try: + settings_data = { + 'ollama_host': OLLAMA_HOST, + 'min_cron_hours': MIN_CRON_HOURS, + 'cron_hours': CRON_HOURS, + 'sync_cooldown_minutes': SYNC_COOLDOWN_MINUTES, + 'llm_model': LLM_MODEL, + 'llm_timeout_seconds': LLM_TIMEOUT_SECONDS, + 'ollama_api_timeout_seconds': OLLAMA_API_TIMEOUT_SECONDS, + 'article_fetch_timeout': ARTICLE_FETCH_TIMEOUT, + 'max_article_length': MAX_ARTICLE_LENGTH + } + + for key, val in settings_data.items(): + cursor.execute( + "INSERT OR IGNORE INTO settings (key, val) VALUES (?, ?)", + (key, str(val)) + ) + + except Exception as e: + logger.error(f"❌ Failed to seed settings: {e}") + def _get_connection(self) -> sqlite3.Connection: """ Create a thread-safe database connection. 
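Note on the settings precedence implemented by _initialize_db and update_constants_from_db above: an environment variable always wins, a value seeded in the settings table is used next, and the built-in default applies last. The helper below is an illustrative sketch of that rule only; resolve_setting is a hypothetical name and is not part of the patch:

import os

def resolve_setting(key, env_var, db_settings, default, cast=str):
    """Effective value for one setting: env var first, then settings table, then default."""
    env_value = os.getenv(env_var)
    if env_value is not None:
        return cast(env_value)              # environment variable takes precedence
    if key in db_settings:
        try:
            return cast(db_settings[key])   # fall back to the seeded settings table
        except (ValueError, TypeError):
            pass                            # invalid stored value: keep the default
    return default

# e.g. resolve_setting("cron_hours", "CRON_HOURS", db_manager.get_all_settings(), 0.5, float)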
@@ -118,27 +274,47 @@ class DatabaseManager: """ conn = sqlite3.connect( self.db_path, - check_same_thread=False, # Allow use across threads - timeout=20.0 # Add timeout to prevent deadlocks + check_same_thread=False, + timeout=30.0 ) conn.row_factory = sqlite3.Row conn.execute("PRAGMA journal_mode=WAL") + conn.execute("PRAGMA busy_timeout=30000") + conn.execute("PRAGMA synchronous=NORMAL") + conn.execute("PRAGMA temp_store=MEMORY") return conn @contextmanager - def get_cursor(self) -> Iterator[sqlite3.Cursor]: + def get_cursor(self, readonly: bool = False) -> Iterator[sqlite3.Cursor]: """ Context manager that provides a database cursor and handles commits and rollbacks. + Args: + readonly: If True, opens connection in readonly mode for better concurrency + Yields: A database cursor for executing SQL statements """ conn = None try: conn = self._get_connection() + + if readonly: + conn.execute("BEGIN DEFERRED") + cursor = conn.cursor() yield cursor - conn.commit() + + if not readonly: + conn.commit() + except sqlite3.OperationalError as e: + if conn: + conn.rollback() + if "database is locked" in str(e).lower(): + logger.warning( + f"⚠️ Database temporarily locked, operation may need retry: {e}" + ) + raise e except Exception as e: if conn: conn.rollback() @@ -147,126 +323,36 @@ class DatabaseManager: if conn: conn.close() - def _initialize_db(self) -> None: + @contextmanager + def get_cursor_with_retry(self, readonly: bool = False, max_retries: int = 3) -> Iterator[sqlite3.Cursor]: """ - Initialize the database schema and default settings. - Creates tables if they don't exist and inserts default values. - """ - logger.info("🗄️ Initializing database...") - - # Create schema - with self.get_cursor() as cursor: - for i, stmt in enumerate(SCHEMA_SQL): - logger.debug(f"📝 Executing schema statement {i+1}/{len(SCHEMA_SQL)}") - cursor.execute(stmt) - - # Add migration for description column if it doesn't exist - try: - cursor.execute("SELECT description FROM news LIMIT 1") - logger.debug("✅ Description column exists") - except sqlite3.OperationalError: - # Column doesn't exist, add it - logger.info("🔧 Adding missing description column to news table...") - cursor.execute("ALTER TABLE news ADD COLUMN description TEXT") - - # Insert initial settings - cursor.execute( - "INSERT INTO settings VALUES (?, ?) ON CONFLICT (key) DO NOTHING", - ("cron_hours", str(CRON_HOURS)) - ) - logger.debug("⚙️ Settings initialized") - - # Insert initial metadata - cursor.execute( - "INSERT INTO meta VALUES (?, ?) ON CONFLICT (key) DO NOTHING", - ("last_sync", "0") - ) - logger.debug("📊 Metadata initialized") - - # Check current feed count - cursor.execute("SELECT COUNT(*) as count FROM feeds") - feed_count = cursor.fetchone()["count"] - logger.info(f"📡 Current feeds in database: {feed_count}") - - # Seed feeds if none exist - if feed_count == 0: - logger.info("🌱 No feeds found, starting seeding process...") - feeds_added = self._seed_feeds(cursor) # Pass the existing cursor - - # Verify seeding worked - cursor.execute("SELECT COUNT(*) as count FROM feeds") - new_feed_count = cursor.fetchone()["count"] - logger.info(f"📡 Feeds after seeding: {new_feed_count}") - else: - logger.info("📡 Feeds already exist, skipping seeding") - - logger.info("✅ Database initialization complete") - - def _seed_feeds(self, cursor: sqlite3.Cursor) -> int: - """ - Seed the database with initial feeds from the seed_feeds.json file. - Only runs if the feeds table is empty. + Context manager with retry logic for database operations. 
Args: - cursor: Database cursor to use for operations + readonly: If True, opens connection in readonly mode + max_retries: Maximum number of retry attempts - Returns: - Number of feeds added + Yields: + A database cursor for executing SQL statements """ - logger.info("🌱 Seeding feeds from seed_feeds.json...") - feeds_added = 0 - - try: - seed_path = Path(__file__).with_name("seed_feeds.json") - logger.debug(f"📁 Looking for seed file at: {seed_path}") - - if not seed_path.exists(): - logger.error(f"❌ Seed file not found at: {seed_path}") - return feeds_added - - with open(seed_path, "r") as f: - seed_data = json.load(f) - - logger.debug(f"📄 Loaded seed data: {seed_data}") - - for country, urls in seed_data.items(): - logger.info(f"🌍 Processing {len(urls)} feeds for country: {country}") - for url in urls: - try: - cursor.execute( - "INSERT INTO feeds (country, url) VALUES (?, ?) " - "ON CONFLICT (url) DO NOTHING", - (country, url) - ) - # Check if the insert actually added a row - if cursor.rowcount > 0: - feeds_added += 1 - logger.debug(f"✅ Added feed: {url} ({country})") - else: - logger.debug(f"⏩ Feed already exists: {url} ({country})") - except Exception as e: - logger.error(f"❌ Failed to add feed {url}: {e}") - - logger.info(f"🌱 Seeding complete: {feeds_added} feeds added") - - except json.JSONDecodeError as e: - logger.error(f"❌ Invalid JSON in seed_feeds.json: {e}") - # Re-read file content for error reporting + for attempt in range(max_retries + 1): try: - with open(seed_path, "r") as f: - content = f.read() - logger.error(f"📄 File content causing error: {content}") - except: - logger.error("📄 Could not re-read file for error reporting") - except FileNotFoundError as e: - logger.error(f"❌ Seed file not found: {e}") - except Exception as e: - logger.error(f"❌ Error seeding feeds: {e}") - - return feeds_added + with self.get_cursor(readonly=readonly) as cursor: + yield cursor + return + except sqlite3.OperationalError as e: + if "database is locked" in str(e).lower() and attempt < max_retries: + wait_time = (attempt + 1) * 0.1 + logger.warning( + f"⚠️ Database locked, retrying in {wait_time}s " + f"(attempt {attempt + 1}/{max_retries + 1})" + ) + import time + time.sleep(wait_time) + continue + raise e -# Initialize database manager db_manager = DatabaseManager(DB_PATH) @@ -296,10 +382,10 @@ class NewsFetcher: Extracted text content from the article, or empty string if failed """ try: - logger.debug(f"🌐 Fetching article content from: {url}") - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/91.0.4472.124 Safari/537.36' } response = await client.get( @@ -311,14 +397,11 @@ class NewsFetcher: response.raise_for_status() - # Parse HTML content soup = BeautifulSoup(response.text, 'html.parser') - # Remove unwanted elements for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']): element.decompose() - # Try to find main content areas content_selectors = [ 'article', '[role="main"]', @@ -334,11 +417,9 @@ class NewsFetcher: article_text = "" - # Try each selector until we find content for selector in content_selectors: elements = soup.select(selector) if elements: - # Get text from all matching elements for element in elements: text = element.get_text(separator=' ', strip=True) if len(text) > len(article_text): @@ -351,16 +432,13 @@ class 
NewsFetcher: if body: article_text = body.get_text(separator=' ', strip=True) - # Clean up the text article_text = re.sub(r'\s+', ' ', article_text) # Normalize whitespace article_text = article_text.strip() # Limit length to avoid overwhelming the LLM if len(article_text) > MAX_ARTICLE_LENGTH: article_text = article_text[:MAX_ARTICLE_LENGTH] + "..." - logger.debug(f"✂️ Truncated article content to {MAX_ARTICLE_LENGTH} characters") - logger.debug(f"📄 Extracted {len(article_text)} characters from article") return article_text except httpx.TimeoutException: @@ -394,7 +472,6 @@ class NewsFetcher: if description: context_info.append(f"RSS-Beschreibung: {description}") if content: - # Show first part of content for context content_preview = content[:500] + "..." if len(content) > 500 else content context_info.append(f"Artikel-Inhalt: {content_preview}") @@ -410,11 +487,10 @@ class NewsFetcher: "2. Falls kein Artikel-Inhalt verfügbar ist, nutze RSS-Titel und -Beschreibung\n" "3. Falls keine ausreichenden Informationen vorliegen, erstelle eine plausible Zusammenfassung basierend auf der URL\n" "4. Gib ausschließlich **gültiges minifiziertes JSON** zurück – kein Markdown, keine Kommentare\n" - "5. Struktur: {\"title\":\"…\",\"summary_de\":\"…\",\"summary_en\":\"…\"}\n" + "5. Struktur: {\"title\":\"…\",\"description\":\"…\"}\n" "6. title: Aussagekräftiger deutscher Titel (max 100 Zeichen)\n" - "7. summary_de: Deutsche Zusammenfassung (max 160 Wörter)\n" - "8. summary_en: Englische Zusammenfassung (max 160 Wörter)\n" - "9. Kein Text vor oder nach dem JSON\n\n" + "7. description: Deutsche Zusammenfassung (zwischen 100 und 160 Wörter)\n" + "8. Kein Text vor oder nach dem JSON\n\n" "### Ausgabe\n" "Jetzt antworte mit dem JSON:" ) @@ -440,17 +516,9 @@ class NewsFetcher: A dictionary containing the article title and summaries in German and English, or None if summarization failed """ - logger.info(f"🤖 Starting article summarization for: {url}") - logger.debug(f"📝 RSS Title: {title[:50]}..." if title else "📝 No RSS title") - logger.debug(f"📄 RSS Description: {description[:100]}..." 
if description else "📄 No RSS description") - - # Fetch article content - logger.debug(f"🌐 Fetching article content...") article_content = await NewsFetcher.fetch_article_content(client, url) - if article_content: - logger.info(f"✅ Successfully fetched article content ({len(article_content)} chars)") - else: + if not article_content: logger.warning(f"⚠️ Could not fetch article content, using RSS data only") prompt = NewsFetcher.build_prompt(url, title, description, article_content) @@ -458,63 +526,52 @@ class NewsFetcher: "model": LLM_MODEL, "prompt": prompt, "stream": False, - "temperature": 0.3, # Slightly increase creativity + "temperature": 0.1, "format": "json" } try: - logger.debug(f"📤 Sending request to Ollama API with model: {LLM_MODEL}") - start_time = datetime.now() - response = await client.post( f"{OLLAMA_HOST}/api/generate", json=payload, timeout=LLM_TIMEOUT_SECONDS ) - elapsed_time = (datetime.now() - start_time).total_seconds() - logger.info(f"⏱️ Ollama API response received in {elapsed_time:.2f}s") - response.raise_for_status() result = response.json() - - logger.debug(f"📥 Raw Ollama response keys: {list(result.keys())}") - - # Parse the JSON string returned by the LLM llm_response = result["response"] - logger.debug(f"🔍 LLM response type: {type(llm_response)}") - logger.debug(f"🔍 LLM response preview: {str(llm_response)[:200]}...") if isinstance(llm_response, str): - logger.debug("📋 Parsing JSON string response") summary_data = json.loads(llm_response) else: - logger.debug("📋 Using direct dict response") summary_data = llm_response # Validate required fields - required_fields = ["title", "summary_de", "summary_en"] + required_fields = ["title", "description"] missing_fields = [field for field in required_fields if field not in summary_data] if missing_fields: - logger.warning(f"⚠️ Missing required fields in summary: {missing_fields}") + logger.warning( + f"⚠️ Missing required fields in summary: {missing_fields}" + ) return None - # Log summary quality metrics - title_len = len(summary_data.get("title", "")) - de_words = len(summary_data.get("summary_de", "").split()) - en_words = len(summary_data.get("summary_en", "").split()) + # Check summary quality metrics + description = len(summary_data.get("description", "").split()) - logger.info(f"✅ Summary generated - Title: {title_len} chars, DE: {de_words} words, EN: {en_words} words") - - if de_words > 160 or en_words > 160: - logger.warning(f"⚠️ Summary exceeds word limit - DE: {de_words}/160, EN: {en_words}/160") + if description > 160 or description < 100: + logger.warning( + f"⚠️ Summary exceeds word limit - " + f"Description: {description}/160" + ) return cast(ArticleSummary, summary_data) except json.JSONDecodeError as e: logger.error(f"❌ JSON parsing error for {url}: {e}") - logger.error(f"🔍 Raw response that failed to parse: {llm_response[:500]}...") + logger.error( + f"🔍 Raw response that failed to parse: {llm_response[:500]}..." + ) return None except httpx.HTTPError as e: logger.error(f"❌ HTTP error for {url}: {e}") @@ -529,37 +586,24 @@ class NewsFetcher: Fetch articles from all feeds and store summaries in the database. This is the main function that runs periodically to update the news database. 
""" - logger.info("🚀 Starting scheduled news harvest...") - harvest_start_time = datetime.now() - total_feeds = 0 total_articles = 0 successful_articles = 0 failed_articles = 0 try: - # Get all feeds from the database with db_manager.get_cursor() as cursor: cursor.execute("SELECT country, url FROM feeds") feeds = cursor.fetchall() - total_feeds = len(feeds) - logger.info(f"📡 Found {total_feeds} feeds to process") - - # Process each feed async with httpx.AsyncClient() as client: for i, feed_row in enumerate(feeds, 1): - logger.info(f"📰 Processing feed {i}/{total_feeds}: {feed_row['url']} ({feed_row['country']})") - feed_stats = await NewsFetcher._process_feed(client, feed_row) total_articles += feed_stats['total'] successful_articles += feed_stats['successful'] failed_articles += feed_stats['failed'] - logger.info(f"📊 Feed {i} complete: {feed_stats['successful']}/{feed_stats['total']} articles processed successfully") - - # Update last sync timestamp current_time = int(datetime.now(timezone.utc).timestamp()) with db_manager.get_cursor() as cursor: cursor.execute( @@ -567,11 +611,6 @@ class NewsFetcher: (str(current_time),) ) - harvest_duration = (datetime.now() - harvest_start_time).total_seconds() - - logger.info(f"✅ News harvest completed in {harvest_duration:.2f}s") - logger.info(f"📊 Final stats: {total_feeds} feeds, {successful_articles}/{total_articles} articles processed successfully") - except Exception as e: logger.error(f"❌ Critical error during harvest: {type(e).__name__}: {e}") raise @@ -583,7 +622,7 @@ class NewsFetcher: ) -> Dict[str, int]: """ Process a single feed, fetching and summarizing all articles. - Now saves summaries immediately to the database. + Now saves summaries immediately to the database with better concurrency. Args: client: An active httpx AsyncClient for making requests @@ -595,7 +634,6 @@ class NewsFetcher: stats = {'total': 0, 'successful': 0, 'failed': 0, 'skipped': 0} try: - logger.debug(f"🔍 Parsing RSS feed: {feed_row['url']}") feed_data = feedparser.parse(feed_row["url"]) if hasattr(feed_data, 'bozo') and feed_data.bozo: @@ -604,7 +642,6 @@ class NewsFetcher: logger.warning(f"⚠️ Feed exception: {feed_data.bozo_exception}") total_entries = len(feed_data.entries) - logger.info(f"📄 Found {total_entries} entries in feed") if total_entries == 0: logger.warning(f"⚠️ No entries found in feed: {feed_row['url']}") @@ -612,46 +649,36 @@ class NewsFetcher: for i, entry in enumerate(feed_data.entries, 1): stats['total'] += 1 - logger.debug(f"📝 Processing article {i}/{total_entries}") - # Skip entries without links or published dates if not hasattr(entry, "link"): - logger.debug(f"⏩ Skipping entry {i}: no link") stats['skipped'] += 1 continue if not hasattr(entry, "published_parsed"): - logger.debug(f"⏩ Skipping entry {i}: no published date") stats['skipped'] += 1 continue article_url = entry.link - logger.debug(f"🔗 Processing article: {article_url}") - # Parse the published date try: published = datetime( *entry.published_parsed[:6], tzinfo=timezone.utc ) - logger.debug(f"📅 Article published: {published}") - except (TypeError, ValueError) as e: - logger.debug(f"⏩ Skipping entry {i}: invalid date - {e}") + except (TypeError, ValueError): stats['skipped'] += 1 continue - # Check if article already exists - with db_manager.get_cursor() as cursor: - cursor.execute("SELECT id FROM news WHERE url = ?", (article_url,)) - if cursor.fetchone(): - logger.debug(f"⏩ Skipping entry {i}: article already exists") - stats['skipped'] += 1 - continue + # Check if article already 
exists - use readonly connection for better concurrency + try: + with db_manager.get_cursor_with_retry(readonly=True) as cursor: + cursor.execute("SELECT id FROM news WHERE url = ?", (article_url,)) + if cursor.fetchone(): + stats['skipped'] += 1 + continue + except Exception as db_error: + logger.warning(f"⚠️ Database check failed for article {i}, continuing: {db_error}") - # Get article summary - logger.debug(f"🤖 Requesting summary for article {i}") - - # Extract title and description from RSS entry rss_title = getattr(entry, 'title', '') rss_description = getattr(entry, 'description', '') or getattr(entry, 'summary', '') @@ -669,27 +696,23 @@ class NewsFetcher: published_timestamp = int(published.timestamp()) - logger.debug(f"💾 Storing article in database immediately after summarization") - - # Store in database immediately after successful summarization - # Store in database try: - with db_manager.get_cursor() as cursor: + with db_manager.get_cursor_with_retry(readonly=False) as cursor: cursor.execute( """ - INSERT INTO news (title, description, url, published, country) + INSERT OR IGNORE INTO news + (title, description, url, published, country) VALUES (?, ?, ?, ?, ?) """, ( summary["title"], - summary["summary_de"], + summary["description"], article_url, published_timestamp, feed_row["country"], ) ) - logger.info(f"✅ Successfully processed and stored article {i}: {summary['title'][:50]}...") stats['successful'] += 1 except Exception as db_error: @@ -697,14 +720,14 @@ class NewsFetcher: stats['failed'] += 1 continue + await asyncio.sleep(0.01) # 10ms delay to yield control + except Exception as e: logger.error(f"❌ Error processing feed {feed_row['url']}: {type(e).__name__}: {e}") - logger.info(f"📊 Feed processing complete: {stats['successful']} successful, {stats['failed']} failed, {stats['skipped']} skipped out of {stats['total']} total") return stats -# Initialize scheduler scheduler = AsyncIOScheduler(timezone="UTC") scheduler.add_job( NewsFetcher.harvest_feeds, @@ -712,11 +735,7 @@ scheduler.add_job( hours=CRON_HOURS, id="harvest" ) -print(f"Starting scheduler with {CRON_HOURS} hours interval") scheduler.start() -print("Scheduler started") -print(f"Next run: {scheduler.get_job('harvest').next_run_time}") - # Pydantic models for API requests and responses @@ -759,15 +778,24 @@ class HoursResponse(BaseModel): hours: float -# Dependency for getting a database cursor async def get_db(): """ - Dependency that provides a database cursor. + Dependency that provides a database cursor with retry logic. Yields: A database cursor for executing SQL statements """ - with db_manager.get_cursor() as cursor: + with db_manager.get_cursor_with_retry(readonly=True) as cursor: + yield cursor + +async def get_db_write(): + """ + Dependency that provides a database cursor for write operations with retry logic. + + Yields: + A database cursor for executing SQL statements + """ + with db_manager.get_cursor_with_retry(readonly=False) as cursor: yield cursor @@ -776,11 +804,12 @@ async def get_db(): async def get_news( country: str = "DE", from_: str = "2025-07-01", - to: str = datetime.now(timezone.utc).strftime("%Y-%m-%d"), + to_: str = datetime.now(timezone.utc).strftime("%Y-%m-%d"), db: sqlite3.Cursor = Depends(get_db) ): """ Get news articles filtered by country and date range. + Now optimized for concurrent access while scheduler is running. 
Args: country: Country code to filter by (default: "DE") @@ -793,19 +822,52 @@ async def get_news( """ try: datetime.fromisoformat(from_) - datetime.fromisoformat(to) - except ValueError: - raise HTTPException(400, "Invalid date format") - finally: + datetime.fromisoformat(to_) + + from_ts = int(datetime.fromisoformat(from_).timestamp()) + to_ts = int(datetime.fromisoformat(to_).timestamp()) + db.execute( """ - SELECT id, title, description, url, published, country, created_at FROM news - WHERE country=? AND published BETWEEN ? AND ? + SELECT id, title, description, url, published, country, created_at + FROM news + WHERE country = ? AND published BETWEEN ? AND ? ORDER BY published DESC + LIMIT 1000 """, - (country, from_, to) + (country, from_ts, to_ts) + ) + + return [dict(row) for row in db.fetchall()] + + except ValueError: + raise HTTPException(400, "Invalid date format. Use ISO format (YYYY-MM-DD)") + except Exception as e: + logger.error(f"❌ Error fetching news: {e}") + raise HTTPException( + 500, "Internal server error while fetching news" + ) + + +@app.get("/feeds", response_model=List[Dict[str, Any]]) +async def list_feeds(db: sqlite3.Cursor = Depends(get_db)): + """ + List all registered news feeds. + + Args: + db: Database cursor dependency + + Returns: + List of feed objects with id, country, and url + """ + try: + db.execute("SELECT * FROM feeds ORDER BY country, url") + return [dict(row) for row in db.fetchall()] + except Exception as e: + logger.error(f"❌ Error fetching feeds: {e}") + raise HTTPException( + 500, "Internal server error while fetching feeds" ) - return [dict(row) for row in db.fetchall()] @app.get("/meta/last-sync", response_model=TimestampResponse) @@ -821,58 +883,16 @@ async def get_last_sync(db: sqlite3.Cursor = Depends(get_db)): """ db.execute("SELECT val FROM meta WHERE key='last_sync'") row = db.fetchone() + if row is None: + import time + return {"ts": int(time.time())} return {"ts": int(row["val"])} -@app.put("/settings/cron", response_model=HoursResponse) -async def set_cron_schedule( - data: CronSettings, - db: sqlite3.Cursor = Depends(get_db) -): - """ - Update the cron schedule for harvesting news. - - Args: - data: New cron settings with hours interval - db: Database cursor dependency - - Returns: - Object containing the updated hours setting - """ - # Ensure minimum interval - hours = max(MIN_CRON_HOURS, data.hours) - - # Update scheduler - scheduler.get_job("harvest").modify(trigger="interval", hours=hours) - - # Update database - db.execute( - "UPDATE settings SET val=? WHERE key='cron_hours'", - (str(hours),) - ) - - return {"hours": hours} - - -@app.get("/feeds", response_model=List[Dict[str, Any]]) -async def list_feeds(db: sqlite3.Cursor = Depends(get_db)): - """ - List all registered news feeds. - - Args: - db: Database cursor dependency - - Returns: - List of feed objects with id, country, and url - """ - db.execute("SELECT * FROM feeds ORDER BY country") - return [dict(row) for row in db.fetchall()] - - @app.post("/feeds", response_model=SuccessResponse) async def add_feed( feed: FeedData, - db: sqlite3.Cursor = Depends(get_db) + db: sqlite3.Cursor = Depends(get_db_write) ): """ Add a new news feed. @@ -884,19 +904,24 @@ async def add_feed( Returns: Success status """ - db.execute( - "INSERT INTO feeds (country, url) VALUES (?, ?) " - "ON CONFLICT (url) DO NOTHING", - (feed.country, feed.url) - ) - - return {"status": "added"} + try: + db.execute( + "INSERT INTO feeds (country, url) VALUES (?, ?) 
" + "ON CONFLICT (url) DO NOTHING", + (feed.country, feed.url) + ) + return {"status": "added"} + except Exception as e: + logger.error(f"❌ Error adding feed: {e}") + raise HTTPException( + 500, "Internal server error while adding feed" + ) @app.delete("/feeds", response_model=SuccessResponse) async def delete_feed( url: str, - db: sqlite3.Cursor = Depends(get_db) + db: sqlite3.Cursor = Depends(get_db_write) ): """ Delete a news feed by URL. @@ -908,8 +933,14 @@ async def delete_feed( Returns: Success status """ - db.execute("DELETE FROM feeds WHERE url=?", (url,)) - return {"status": "deleted"} + try: + db.execute("DELETE FROM feeds WHERE url=?", (url,)) + return {"status": "deleted"} + except Exception as e: + logger.error(f"❌ Error deleting feed: {e}") + raise HTTPException( + 500, "Internal server error while deleting feed" + ) @app.get("/model/status", response_model=Union[ModelStatus, ErrorResponse]) @@ -923,7 +954,6 @@ async def get_model_status(): """ try: async with httpx.AsyncClient() as client: - # Get model information from Ollama response = await client.get( f"{OLLAMA_HOST}/api/tags", timeout=OLLAMA_API_TIMEOUT_SECONDS @@ -933,7 +963,6 @@ async def get_model_status(): models_data = response.json() models = models_data.get("models", []) - # Check if the current model is available model_available = any( model.get("name") == LLM_MODEL for model in models ) @@ -958,27 +987,79 @@ async def manual_sync(db: sqlite3.Cursor = Depends(get_db)): Returns: Success status or error response if sync was triggered too recently """ - # Check when the last sync was performed db.execute("SELECT val FROM meta WHERE key='last_sync'") row = db.fetchone() last_sync_ts = int(row["val"]) - # Enforce cooldown period now = datetime.now(timezone.utc) last_sync_time = datetime.fromtimestamp(last_sync_ts, timezone.utc) if now - last_sync_time < timedelta(minutes=SYNC_COOLDOWN_MINUTES): return Response( status_code=status.HTTP_429_TOO_MANY_REQUESTS, - content=f"Sync too soon – wait {SYNC_COOLDOWN_MINUTES} min." + content="Sync was triggered too recently. Please wait before triggering again." ) - # Trigger sync in background try: task = asyncio.create_task(NewsFetcher.harvest_feeds()) return {"status": "triggered", "task_id": id(task)} except Exception as e: - raise HTTPException(500, f"Failed to trigger sync: {str(e)}") + raise HTTPException( + 500, f"Failed to trigger sync: {str(e)}" + ) + + +@app.get("/settings/cron", response_model=HoursResponse) +async def get_cron_schedule(db: sqlite3.Cursor = Depends(get_db)): + """ + Get the current cron schedule for harvesting news. + + Args: + db: Database cursor dependency + + Returns: + Object containing the current hours setting + """ + db.execute("SELECT val FROM settings WHERE key='cron_hours'") + row = db.fetchone() + + if row is None: + return {"hours": CRON_HOURS} + + try: + hours = float(row["val"]) + return {"hours": hours} + except (ValueError, TypeError): + return {"hours": CRON_HOURS} + + +@app.post("/settings/cron", response_model=HoursResponse) +async def update_cron_schedule(data: CronSettings, db: sqlite3.Cursor = Depends(get_db_write)): + """ + Update the cron schedule for harvesting news. + + Args: + data: New cron settings with hours interval + db: Database cursor dependency + + Returns: + Object containing the updated hours setting + """ + hours = max(MIN_CRON_HOURS, data.hours) + + scheduler.get_job("harvest").modify(trigger=IntervalTrigger(hours=hours)) + + if os.getenv("CRON_HOURS") is None: + db.execute( + "UPDATE settings SET val=? 
WHERE key='cron_hours'", + (str(hours),) + ) + + global CRON_HOURS + CRON_HOURS = hours + + return {"hours": hours} + # Mount static frontend frontend_path = os.path.join( diff --git a/backend/app/schema.sql b/backend/app/schema.sql new file mode 100644 index 0000000..7684db2 --- /dev/null +++ b/backend/app/schema.sql @@ -0,0 +1,34 @@ +-- Database schema for Owly News Summariser + +-- News table to store articles +CREATE TABLE IF NOT EXISTS news ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + title TEXT NOT NULL, + description TEXT, + url TEXT NOT NULL, + published TIMESTAMP NOT NULL, + country TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- Index for faster queries on published date +CREATE INDEX IF NOT EXISTS idx_news_published ON news(published); + +-- Feeds table to store RSS feed sources +CREATE TABLE IF NOT EXISTS feeds ( + id INTEGER PRIMARY KEY, + country TEXT, + url TEXT UNIQUE NOT NULL +); + +-- Settings table for application configuration +CREATE TABLE IF NOT EXISTS settings ( + key TEXT PRIMARY KEY, + val TEXT NOT NULL +); + +-- Meta table for application metadata +CREATE TABLE IF NOT EXISTS meta ( + key TEXT PRIMARY KEY, + val TEXT NOT NULL +); diff --git a/backend/example.env b/backend/example.env index 048d024..a4b1ec0 100644 --- a/backend/example.env +++ b/backend/example.env @@ -1,8 +1,29 @@ # URL for the Ollama service OLLAMA_HOST=http://localhost:11434 -# Interval for scheduled news fetching in hours (minimum: 0.5) +# Interval for scheduled news fetching in hours CRON_HOURS=1 +# Minimum interval for scheduled news fetching in hours +MIN_CRON_HOURS=0.5 + +# Cooldown period in minutes between manual syncs +SYNC_COOLDOWN_MINUTES=30 + +# LLM model to use for summarization +LLM_MODEL=qwen2:7b-instruct-q4_K_M + +# Timeout in seconds for LLM requests +LLM_TIMEOUT_SECONDS=180 + +# Timeout in seconds for Ollama API requests +OLLAMA_API_TIMEOUT_SECONDS=10 + +# Timeout in seconds for article fetching +ARTICLE_FETCH_TIMEOUT=30 + +# Maximum length of article content to process +MAX_ARTICLE_LENGTH=5000 + # SQLite database connection string -DATABASE_URL=sqlite:///./newsdb.sqlite +DB_NAME=owlynews.sqlite3 diff --git a/frontend/src/App.vue b/frontend/src/App.vue index 79166b2..f730d6d 100644 --- a/frontend/src/App.vue +++ b/frontend/src/App.vue @@ -4,6 +4,7 @@ import {useNews} from './stores/useNews'; import FeedManager from './components/FeedManager.vue'; import CronSlider from './components/CronSlider.vue'; import SyncButton from './components/SyncButton.vue'; +import NewsRefreshButton from './components/NewsRefreshButton.vue'; import ModelStatus from './components/ModelStatus.vue'; const news = useNews(); @@ -12,31 +13,28 @@ const filters = ref({country: 'DE'}); onMounted(async () => { await news.loadLastSync(); await news.sync(filters.value); + await news.getNews(filters.value); });
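For reference, the reworked cron endpoints in this patch (GET /settings/cron returns the stored interval, POST clamps the requested value to MIN_CRON_HOURS and reschedules the harvest job) can be exercised with a short httpx client. This is a usage sketch only; the base URL assumes a local development server:

import httpx

BASE_URL = "http://localhost:8000"  # assumed local dev address, adjust as needed

with httpx.Client(base_url=BASE_URL, timeout=10) as client:
    current = client.get("/settings/cron").json()
    print("current interval (hours):", current["hours"])

    # Request a 0.1 h interval; the server clamps it to MIN_CRON_HOURS (0.5 by default)
    updated = client.post("/settings/cron", json={"hours": 0.1}).json()
    print("effective interval (hours):", updated["hours"])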