enhance: improve news fetching, processing, and logging architecture

2025-08-01 18:42:34 +02:00
parent eed5f4afbb
commit 003b8da4b2
4 changed files with 356 additions and 108 deletions

View File

@@ -14,8 +14,10 @@ import os
import sqlite3
from contextlib import contextmanager
from datetime import datetime, timezone, timedelta
from fastapi import HTTPException
from pathlib import Path
from typing import Dict, List, Optional, Any, Union, Iterator, Tuple, TypedDict, cast
import logging
import feedparser
import httpx
@@ -27,7 +29,7 @@ from pydantic import BaseModel
# Constants
DB_PATH = Path("owlynews.sqlite")
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
MIN_CRON_HOURS = 0.5
MIN_CRON_HOURS = 0.1  # TODO: change back to 0.5
DEFAULT_CRON_HOURS = float(os.getenv("CRON_HOURS", MIN_CRON_HOURS))
CRON_HOURS = max(MIN_CRON_HOURS, DEFAULT_CRON_HOURS)
SYNC_COOLDOWN_MINUTES = 30
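The clamp above means an out-of-range `CRON_HOURS` environment value silently falls back to the floor. A small illustrative sketch of that behaviour (`resolve_cron_hours` is hypothetical; the constant mirrors the one above):

```python
from typing import Optional

MIN_CRON_HOURS = 0.1

def resolve_cron_hours(env_value: Optional[str]) -> float:
    """Mirror of the module-level clamp: env override, floored at MIN_CRON_HOURS."""
    raw = float(env_value) if env_value is not None else MIN_CRON_HOURS
    return max(MIN_CRON_HOURS, raw)

assert resolve_cron_hours(None) == 0.1     # no CRON_HOURS set -> floor
assert resolve_cron_hours("2") == 2.0      # sane override wins
assert resolve_cron_hours("0.01") == 0.1   # too-small override clamped to the floor
```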
@@ -35,6 +37,13 @@ LLM_MODEL = "qwen2:7b-instruct-q4_K_M"
LLM_TIMEOUT_SECONDS = 180
OLLAMA_API_TIMEOUT_SECONDS = 10
# Logging configuration
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# FastAPI app initialization
app = FastAPI(
title="Owly News Summariser",
@@ -46,20 +55,19 @@ app = FastAPI(
SCHEMA_SQL = [
"""
CREATE TABLE IF NOT EXISTS news (
id TEXT PRIMARY KEY, -- e.g. URL as unique identifier
id INTEGER PRIMARY KEY AUTOINCREMENT,
title TEXT NOT NULL,
summary_de TEXT,
summary_en TEXT,
published INTEGER, -- Unix epoch (UTC); use TEXT ISO-8601 if you prefer
source TEXT,
country TEXT,
source_feed TEXT
description TEXT,
url TEXT NOT NULL,
published TEXT NOT NULL,
country TEXT NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""",
"CREATE INDEX IF NOT EXISTS idx_news_published ON news(published)",
"""
CREATE TABLE IF NOT EXISTS feeds (
id INTEGER PRIMARY KEY, -- auto-increment via rowid
id INTEGER PRIMARY KEY,
country TEXT,
url TEXT UNIQUE NOT NULL
)
@@ -94,23 +102,23 @@ class DatabaseManager:
db_path: Path to the SQLite database file
"""
self.db_path = db_path
self._connection = None
self._initialize_db()
def _get_connection(self) -> sqlite3.Connection:
"""
Get or create a database connection.
Create a thread-safe database connection.
Returns:
An active SQLite connection
"""
if self._connection is None:
self._connection = sqlite3.connect(
self.db_path,
check_same_thread=False
)
self._connection.row_factory = sqlite3.Row
return self._connection
conn = sqlite3.connect(
self.db_path,
check_same_thread=False, # Allow use across threads
timeout=20.0 # Add timeout to prevent deadlocks
)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
return conn
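Switching to one short-lived connection per call with WAL journaling lets concurrent readers coexist with the scheduler's writer. A standalone sketch of the same connection setup (assumes the same database file; not part of the commit):

```python
import sqlite3

# Open a connection the way _get_connection does and confirm the journal mode
# that the PRAGMA reports back.
conn = sqlite3.connect("owlynews.sqlite", check_same_thread=False, timeout=20.0)
conn.row_factory = sqlite3.Row
mode = conn.execute("PRAGMA journal_mode=WAL").fetchone()[0]
print(mode)  # "wal" - readers no longer block the writer and vice versa
conn.close()
```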
@contextmanager
def get_cursor(self) -> Iterator[sqlite3.Cursor]:
@@ -119,70 +127,138 @@ class DatabaseManager:
Yields:
A database cursor for executing SQL statements
Example:
```python
with db_manager.get_cursor() as cursor:
cursor.execute("SELECT * FROM table")
results = cursor.fetchall()
```
"""
conn = self._get_connection()
cursor = conn.cursor()
conn = None
try:
conn = self._get_connection()
cursor = conn.cursor()
yield cursor
conn.commit()
except Exception:
conn.rollback()
raise
except Exception as e:
if conn:
conn.rollback()
raise e
finally:
if conn:
conn.close()
def _initialize_db(self) -> None:
"""
Initialize the database schema and default settings.
Creates tables if they don't exist and inserts default values.
"""
logger.info("🗄️ Initializing database...")
# Create schema
with self.get_cursor() as cursor:
for stmt in SCHEMA_SQL:
for i, stmt in enumerate(SCHEMA_SQL):
logger.debug(f"📝 Executing schema statement {i+1}/{len(SCHEMA_SQL)}")
cursor.execute(stmt)
# Add migration for description column if it doesn't exist
try:
cursor.execute("SELECT description FROM news LIMIT 1")
logger.debug("✅ Description column exists")
except sqlite3.OperationalError:
# Column doesn't exist, add it
logger.info("🔧 Adding missing description column to news table...")
cursor.execute("ALTER TABLE news ADD COLUMN description TEXT")
# Insert initial settings
cursor.execute(
"INSERT INTO settings VALUES (?, ?) ON CONFLICT (key) DO NOTHING",
("cron_hours", str(CRON_HOURS))
)
logger.debug("⚙️ Settings initialized")
# Insert initial metadata
cursor.execute(
"INSERT INTO meta VALUES (?, ?) ON CONFLICT (key) DO NOTHING",
("last_sync", "0")
)
logger.debug("📊 Metadata initialized")
# Check current feed count
cursor.execute("SELECT COUNT(*) as count FROM feeds")
feed_count = cursor.fetchone()["count"]
logger.info(f"📡 Current feeds in database: {feed_count}")
# Seed feeds if none exist
cursor.execute("SELECT COUNT(*) as count FROM feeds")
if cursor.fetchone()["count"] == 0:
self._seed_feeds()
if feed_count == 0:
logger.info("🌱 No feeds found, starting seeding process...")
feeds_added = self._seed_feeds(cursor) # Pass the existing cursor
def _seed_feeds(self) -> None:
# Verify seeding worked
cursor.execute("SELECT COUNT(*) as count FROM feeds")
new_feed_count = cursor.fetchone()["count"]
logger.info(f"📡 Feeds after seeding: {new_feed_count}")
else:
logger.info("📡 Feeds already exist, skipping seeding")
logger.info("✅ Database initialization complete")
def _seed_feeds(self, cursor: sqlite3.Cursor) -> int:
"""
Seed the database with initial feeds from the seed_feeds.json file.
Only runs if the feeds table is empty.
Args:
cursor: Database cursor to use for operations
Returns:
Number of feeds added
"""
logger.info("🌱 Seeding feeds from seed_feeds.json...")
feeds_added = 0
try:
seed_path = Path(__file__).with_name("seed_feeds.json")
logger.debug(f"📁 Looking for seed file at: {seed_path}")
if not seed_path.exists():
logger.error(f"❌ Seed file not found at: {seed_path}")
return feeds_added
with open(seed_path, "r") as f:
seed_data = json.load(f)
with self.get_cursor() as cursor:
for country, urls in seed_data.items():
for url in urls:
logger.debug(f"📄 Loaded seed data: {seed_data}")
for country, urls in seed_data.items():
logger.info(f"🌍 Processing {len(urls)} feeds for country: {country}")
for url in urls:
try:
cursor.execute(
"INSERT INTO feeds (country, url) VALUES (?, ?) "
"ON CONFLICT (url) DO NOTHING",
(country, url)
)
except (FileNotFoundError, json.JSONDecodeError) as e:
print(f"Error seeding feeds: {e}")
# Check if the insert actually added a row
if cursor.rowcount > 0:
feeds_added += 1
logger.debug(f"✅ Added feed: {url} ({country})")
else:
logger.debug(f"⏩ Feed already exists: {url} ({country})")
except Exception as e:
logger.error(f"❌ Failed to add feed {url}: {e}")
logger.info(f"🌱 Seeding complete: {feeds_added} feeds added")
except json.JSONDecodeError as e:
logger.error(f"❌ Invalid JSON in seed_feeds.json: {e}")
# Re-read file content for error reporting
try:
with open(seed_path, "r") as f:
content = f.read()
logger.error(f"📄 File content causing error: {content}")
except Exception:
logger.error("📄 Could not re-read file for error reporting")
except FileNotFoundError as e:
logger.error(f"❌ Seed file not found: {e}")
except Exception as e:
logger.error(f"❌ Error seeding feeds: {e}")
return feeds_added
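The seeding loop expects a mapping of country code to a list of RSS feed URLs, which matches the seed_feeds.json changed later in this commit. A minimal sketch of how that shape is consumed (print stands in for the INSERT):

```python
import json

# Shape expected by _seed_feeds: country code -> list of feed URLs.
seed_data = json.loads(
    '{"DE": ["https://www.tagesschau.de/xml/rss2"], '
    '"EU": ["https://www.euronews.com/rss?level=theme&name=news"]}'
)
for country, urls in seed_data.items():
    for url in urls:
        # Each (country, url) pair is INSERTed with ON CONFLICT (url) DO NOTHING,
        # so re-running the seeding is idempotent.
        print(country, url)
```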
# Initialize database manager
@@ -203,41 +279,51 @@ class NewsFetcher:
"""
@staticmethod
def build_prompt(url: str) -> str:
def build_prompt(url: str, title: str = "", description: str = "") -> str:
"""
Generate a prompt for the LLM to summarize an article.
Args:
url: Public URL of the article to summarize
title: Article title from RSS feed (optional)
description: Article description from RSS feed (optional)
Returns:
A formatted prompt string that instructs the LLM to generate
a JSON response with title and summaries in German and English
Note:
LLMs like qwen2 don't have native web access; the model will
generate summaries based on its training data and the URL.
"""
context_info = []
if title:
context_info.append(f"Titel: {title}")
if description:
context_info.append(f"Beschreibung: {description}")
context = "\n".join(context_info) if context_info else "Keine zusätzlichen Informationen verfügbar."
return (
"### Aufgabe\n"
f"Du bekommst eine öffentliche URL: {url}\n"
f"Du sollst eine Nachricht basierend auf der URL und den verfügbaren Informationen zusammenfassen.\n"
f"URL: {url}\n"
f"Verfügbare Informationen:\n{context}\n\n"
"### Regeln\n"
"1. **Entnimm den Inhalt nicht automatisch.** "
"Falls dir der Text nicht vorliegt, antworte mit leeren Strings.\n"
"2. Gib ausschließlich **gültiges minifiziertes JSON** zurück "
"kein Markdown, keine Kommentare.\n"
"3. Struktur:\n"
"{\"title\":\"\",\"summary_de\":\"\",\"summary_en\":\"\"}\n"
"4. summary_de ≤ 160 Wörter, summary_en ≤ 160 Wörter. Zähle selbst.\n"
"5. Kein Text vor oder nach dem JSON.\n"
"1. Nutze die verfügbaren Informationen (Titel, Beschreibung) und dein Wissen über die URL-Domain\n"
"2. Falls keine ausreichenden Informationen vorliegen, erstelle eine plausible Zusammenfassung basierend auf der URL\n"
"3. Gib ausschließlich **gültiges minifiziertes JSON** zurück kein Markdown, keine Kommentare\n"
"4. Struktur: {\"title\":\"\",\"summary_de\":\"\",\"summary_en\":\"\"}\n"
"5. title: Aussagekräftiger deutscher Titel (max 100 Zeichen)\n"
"6. summary_de: Deutsche Zusammenfassung (max 160 Wörter)\n"
"7. summary_en: Englische Zusammenfassung (max 160 Wörter)\n"
"8. Kein Text vor oder nach dem JSON\n\n"
"### Ausgabe\n"
"Jetzt antworte."
"Jetzt antworte mit dem JSON:"
)
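An illustrative call of the extended prompt builder, run next to the class above; the URL, title, and description are made up:

```python
prompt = NewsFetcher.build_prompt(
    "https://example.com/artikel/123",
    title="Beispiel-Schlagzeile",
    description="Kurze Beschreibung aus dem RSS-Feed.",
)
print(prompt)  # contains the URL, the "Verfügbare Informationen" block and the JSON rules
```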
@staticmethod
async def summarize_article(
client: httpx.AsyncClient,
url: str
url: str,
title: str = "",
description: str = ""
) -> Optional[ArticleSummary]:
"""
Generate a summary of an article using the LLM.
@@ -245,31 +331,85 @@ class NewsFetcher:
Args:
client: An active httpx AsyncClient for making requests
url: URL of the article to summarize
title: Article title from RSS feed
description: Article description from RSS feed
Returns:
A dictionary containing the article title and summaries in German and English,
or None if summarization failed
"""
prompt = NewsFetcher.build_prompt(url)
logger.info(f"🤖 Starting article summarization for: {url}")
logger.debug(f"📝 RSS Title: {title[:50]}..." if title else "📝 No RSS title")
logger.debug(f"📄 RSS Description: {description[:100]}..." if description else "📄 No RSS description")
prompt = NewsFetcher.build_prompt(url, title, description)
payload = {
"model": LLM_MODEL,
"prompt": prompt,
"stream": False,
"temperature": 0.2,
"temperature": 0.3, # Slightly increase creativity
"format": "json"
}
try:
logger.debug(f"📤 Sending request to Ollama API with model: {LLM_MODEL}")
start_time = datetime.now()
response = await client.post(
f"{OLLAMA_HOST}/api/generate",
json=payload,
timeout=LLM_TIMEOUT_SECONDS
)
elapsed_time = (datetime.now() - start_time).total_seconds()
logger.info(f"⏱️ Ollama API response received in {elapsed_time:.2f}s")
response.raise_for_status()
result = response.json()
return cast(ArticleSummary, result["response"])
except (KeyError, ValueError, httpx.HTTPError, json.JSONDecodeError) as e:
print(f"Error summarizing article {url}: {e}")
logger.debug(f"📥 Raw Ollama response keys: {list(result.keys())}")
# Parse the JSON string returned by the LLM
llm_response = result["response"]
logger.debug(f"🔍 LLM response type: {type(llm_response)}")
logger.debug(f"🔍 LLM response preview: {str(llm_response)[:200]}...")
if isinstance(llm_response, str):
logger.debug("📋 Parsing JSON string response")
summary_data = json.loads(llm_response)
else:
logger.debug("📋 Using direct dict response")
summary_data = llm_response
# Validate required fields
required_fields = ["title", "summary_de", "summary_en"]
missing_fields = [field for field in required_fields if field not in summary_data]
if missing_fields:
logger.warning(f"⚠️ Missing required fields in summary: {missing_fields}")
return None
# Log summary quality metrics
title_len = len(summary_data.get("title", ""))
de_words = len(summary_data.get("summary_de", "").split())
en_words = len(summary_data.get("summary_en", "").split())
logger.info(f"✅ Summary generated - Title: {title_len} chars, DE: {de_words} words, EN: {en_words} words")
if de_words > 160 or en_words > 160:
logger.warning(f"⚠️ Summary exceeds word limit - DE: {de_words}/160, EN: {en_words}/160")
return cast(ArticleSummary, summary_data)
except json.JSONDecodeError as e:
logger.error(f"❌ JSON parsing error for {url}: {e}")
logger.error(f"🔍 Raw response that failed to parse: {llm_response[:500]}...")
return None
except httpx.HTTPError as e:
logger.error(f"❌ HTTP error for {url}: {e}")
return None
except Exception as e:
logger.error(f"❌ Unexpected error summarizing {url}: {type(e).__name__}: {e}")
return None
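A hedged usage sketch for the summarizer, assuming the module is imported and a local Ollama instance is running with the configured model; the article URL and RSS metadata are made up:

```python
import asyncio
import httpx

async def demo() -> None:
    # Requires a running Ollama instance at OLLAMA_HOST with LLM_MODEL pulled.
    async with httpx.AsyncClient() as client:
        summary = await NewsFetcher.summarize_article(
            client,
            "https://example.com/artikel/123",
            title="Beispiel-Schlagzeile",
            description="Kurze Beschreibung aus dem RSS-Feed.",
        )
        if summary:
            print(summary["title"], "-", len(summary["summary_de"].split()), "Wörter (DE)")

asyncio.run(demo())
```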
@staticmethod
@@ -278,16 +418,35 @@ class NewsFetcher:
Fetch articles from all feeds and store summaries in the database.
This is the main function that runs periodically to update the news database.
"""
logger.info("🚀 Starting scheduled news harvest...")
harvest_start_time = datetime.now()
total_feeds = 0
total_articles = 0
successful_articles = 0
failed_articles = 0
try:
# Get all feeds from the database
with db_manager.get_cursor() as cursor:
cursor.execute("SELECT country, url FROM feeds")
feeds = cursor.fetchall()
total_feeds = len(feeds)
logger.info(f"📡 Found {total_feeds} feeds to process")
# Process each feed
async with httpx.AsyncClient() as client:
for feed_row in feeds:
await NewsFetcher._process_feed(client, feed_row)
for i, feed_row in enumerate(feeds, 1):
logger.info(f"📰 Processing feed {i}/{total_feeds}: {feed_row['url']} ({feed_row['country']})")
feed_stats = await NewsFetcher._process_feed(client, feed_row)
total_articles += feed_stats['total']
successful_articles += feed_stats['successful']
failed_articles += feed_stats['failed']
logger.info(f"📊 Feed {i} complete: {feed_stats['successful']}/{feed_stats['total']} articles processed successfully")
# Update last sync timestamp
current_time = int(datetime.now(timezone.utc).timestamp())
@@ -296,30 +455,66 @@ class NewsFetcher:
"UPDATE meta SET val=? WHERE key='last_sync'",
(str(current_time),)
)
harvest_duration = (datetime.now() - harvest_start_time).total_seconds()
logger.info(f"✅ News harvest completed in {harvest_duration:.2f}s")
logger.info(f"📊 Final stats: {total_feeds} feeds, {successful_articles}/{total_articles} articles processed successfully")
except Exception as e:
print(f"Error harvesting feeds: {e}")
logger.error(f"❌ Critical error during harvest: {type(e).__name__}: {e}")
raise
@staticmethod
async def _process_feed(
client: httpx.AsyncClient,
feed_row: sqlite3.Row
) -> None:
) -> Dict[str, int]:
"""
Process a single feed, fetching and summarizing all articles.
Args:
client: An active httpx AsyncClient for making requests
feed_row: A database row containing feed information
Returns:
Dictionary with processing statistics
"""
stats = {'total': 0, 'successful': 0, 'failed': 0, 'skipped': 0}
try:
logger.debug(f"🔍 Parsing RSS feed: {feed_row['url']}")
feed_data = feedparser.parse(feed_row["url"])
for entry in feed_data.entries:
if hasattr(feed_data, 'bozo') and feed_data.bozo:
logger.warning(f"⚠️ Feed has parsing issues: {feed_row['url']}")
if hasattr(feed_data, 'bozo_exception'):
logger.warning(f"⚠️ Feed exception: {feed_data.bozo_exception}")
total_entries = len(feed_data.entries)
logger.info(f"📄 Found {total_entries} entries in feed")
if total_entries == 0:
logger.warning(f"⚠️ No entries found in feed: {feed_row['url']}")
return stats
for i, entry in enumerate(feed_data.entries, 1):
stats['total'] += 1
logger.debug(f"📝 Processing article {i}/{total_entries}")
# Skip entries without links or published dates
if not hasattr(entry, "link") or not hasattr(entry, "published_parsed"):
if not hasattr(entry, "link"):
logger.debug(f"⏩ Skipping entry {i}: no link")
stats['skipped'] += 1
continue
article_id = entry.link
if not hasattr(entry, "published_parsed"):
logger.debug(f"⏩ Skipping entry {i}: no published date") # TODO: change back to 0.5
stats['skipped'] += 1
continue
article_url = entry.link
logger.debug(f"🔗 Processing article: {article_url}")
# Parse the published date
try:
@@ -327,39 +522,80 @@ class NewsFetcher:
*entry.published_parsed[:6],
tzinfo=timezone.utc
)
except (TypeError, ValueError):
# Skip entries with invalid dates
logger.debug(f"📅 Article published: {published}")
except (TypeError, ValueError) as e:
logger.debug(f"⏩ Skipping entry {i}: invalid date - {e}")
stats['skipped'] += 1
continue
# Check if article already exists
with db_manager.get_cursor() as cursor:
cursor.execute("SELECT id FROM news WHERE url = ?", (article_url,))
if cursor.fetchone():
logger.debug(f"⏩ Skipping entry {i}: article already exists")
stats['skipped'] += 1
continue
# Get article summary
summary = await NewsFetcher.summarize_article(client, entry.link)
logger.debug(f"🤖 Requesting summary for article {i}")
# Extract title and description from RSS entry
rss_title = getattr(entry, 'title', '')
rss_description = getattr(entry, 'description', '') or getattr(entry, 'summary', '')
summary = await NewsFetcher.summarize_article(
client,
article_url,
title=rss_title,
description=rss_description
)
if not summary:
logger.warning(f"❌ Failed to get summary for article {i}: {article_url}")
stats['failed'] += 1
continue
published_timestamp = int(published.timestamp())
# Handle source field - it can be a string or dict
source_value = entry.get("source", feed_row["url"])
if isinstance(source_value, dict):
source_title = source_value.get("title", feed_row["url"])
else:
source_title = source_value if source_value else feed_row["url"]
logger.debug(f"💾 Storing article in database")
# Store in database
with db_manager.get_cursor() as cursor:
cursor.execute(
"""
INSERT INTO news (
id, title, summary_de, summary_en, published,
source, country, source_feed
try:
with db_manager.get_cursor() as cursor:
cursor.execute(
"""
INSERT INTO news (title, description, url, published, country)
VALUES (?, ?, ?, ?, ?)
""",
(
summary["title"],
summary["summary_de"],
article_url,
published_timestamp,
feed_row["country"],
)
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT (id) DO NOTHING
""",
(
article_id,
summary["title"],
summary["summary_de"],
summary["summary_en"],
published.isoformat(),
entry.get("source", {}).get("title", feed_row["url"]),
feed_row["country"],
feed_row["url"],
)
)
logger.info(f"✅ Successfully processed article {i}: {summary['title'][:50]}...")
stats['successful'] += 1
except Exception as db_error:
logger.error(f"❌ Database error for article {i}: {db_error}")
stats['failed'] += 1
continue
except Exception as e:
print(f"Error processing feed {feed_row['url']}: {e}")
logger.error(f"Error processing feed {feed_row['url']}: {type(e).__name__}: {e}")
logger.info(f"📊 Feed processing complete: {stats['successful']} successful, {stats['failed']} failed, {stats['skipped']} skipped out of {stats['total']} total")
return stats
# Initialize scheduler
@@ -370,7 +606,11 @@ scheduler.add_job(
hours=CRON_HOURS,
id="harvest"
)
print(f"Starting scheduler with {CRON_HOURS} hours interval")
scheduler.start()
print("Scheduler started")
print(f"Next run: {scheduler.get_job('harvest').next_run_time}")
# Pydantic models for API requests and responses
@@ -414,7 +654,7 @@ class HoursResponse(BaseModel):
# Dependency for getting a database cursor
def get_db():
async def get_db():
"""
Dependency that provides a database cursor.
@@ -445,14 +685,20 @@ async def get_news(
Returns:
List of news articles matching the criteria
"""
db.execute(
"""
SELECT * FROM news
WHERE country=? AND published BETWEEN ? AND ?
ORDER BY published DESC
""",
(country, from_, to)
)
try:
    datetime.fromisoformat(from_)
    datetime.fromisoformat(to)
except ValueError:
    raise HTTPException(400, "Invalid date format")

db.execute(
    """
    SELECT id, title, description, url, published, country, created_at FROM news
    WHERE country=? AND published BETWEEN ? AND ?
    ORDER BY published DESC
    """,
    (country, from_, to)
)
return [dict(row) for row in db.fetchall()]
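A hedged example request against the endpoint above; the route path and query parameter names are not visible in this diff, so "/news", "country", "from", and "to" are assumptions:

```python
import httpx

resp = httpx.get(
    "http://localhost:8000/news",
    params={
        "country": "DE",
        "from": "2025-08-01T00:00:00",
        "to": "2025-08-02T00:00:00",
    },
)
resp.raise_for_status()
print(len(resp.json()), "articles")
```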
@@ -622,9 +868,11 @@ async def manual_sync(db: sqlite3.Cursor = Depends(get_db)):
)
# Trigger sync in background
asyncio.create_task(NewsFetcher.harvest_feeds())
return {"status": "triggered"}
try:
task = asyncio.create_task(NewsFetcher.harvest_feeds())
return {"status": "triggered", "task_id": id(task)}
except Exception as e:
raise HTTPException(500, f"Failed to trigger sync: {str(e)}")
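One note on the fire-and-forget task: a bare `asyncio.create_task` result can be garbage-collected before the harvest finishes. A common guard, sketched here as a hypothetical helper next to `NewsFetcher` (not part of this commit):

```python
import asyncio

# Keep a strong reference so the task is not garbage-collected mid-harvest.
_background_tasks: set[asyncio.Task] = set()

def launch_harvest() -> asyncio.Task:
    task = asyncio.create_task(NewsFetcher.harvest_feeds())
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)
    return task
```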
# Mount static frontend
frontend_path = os.path.join(

View File

@@ -1,7 +1,6 @@
{
"DE": [
"https://www.tagesschau.de/xml/rss2",
"https://www.spiegel.de/schlagzeilen/tops/index.rss"
"https://www.tagesschau.de/xml/rss2"
],
"EU": [
"https://www.euronews.com/rss?level=theme&name=news"

View File

@@ -37,7 +37,7 @@ onMounted(async () => {
}}</p>
<p class="mt-2">{{ a.summary_de }}</p>
<p class="italic mt-2 text-sm text-gray-700">{{ a.summary_en }}</p>
<a :href="a.id" target="_blank" class="text-blue-600 hover:underline">Original </a>
<a :href="a.url" target="_blank" class="text-blue-600 hover:underline">Original </a>
</article>
</main>
</template>

View File

@@ -4,9 +4,10 @@ import {set, get} from 'idb-keyval';
export const useNews = defineStore('news', {
state: () => ({
articles: [] as {
id: string,
id: number,
published: number,
title: string,
url: string,
source: string,
summary_de: string,
summary_en: string