enhance: improve news fetching, processing, and logging architecture
@@ -14,8 +14,10 @@ import os
import sqlite3
from contextlib import contextmanager
from datetime import datetime, timezone, timedelta
from http.client import HTTPException
from pathlib import Path
from typing import Dict, List, Optional, Any, Union, Iterator, Tuple, TypedDict, cast
import logging

import feedparser
import httpx
@@ -27,7 +29,7 @@ from pydantic import BaseModel
# Constants
DB_PATH = Path("owlynews.sqlite")
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
MIN_CRON_HOURS = 0.5
MIN_CRON_HOURS = 0.1
DEFAULT_CRON_HOURS = float(os.getenv("CRON_HOURS", MIN_CRON_HOURS))
CRON_HOURS = max(MIN_CRON_HOURS, DEFAULT_CRON_HOURS)
SYNC_COOLDOWN_MINUTES = 30
@@ -35,6 +37,13 @@ LLM_MODEL = "qwen2:7b-instruct-q4_K_M"
LLM_TIMEOUT_SECONDS = 180
OLLAMA_API_TIMEOUT_SECONDS = 10

# Add logging configuration at the top of your file
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# FastAPI app initialization
app = FastAPI(
title="Owly News Summariser",
@@ -46,20 +55,19 @@ app = FastAPI(
SCHEMA_SQL = [
"""
CREATE TABLE IF NOT EXISTS news (
id TEXT PRIMARY KEY, -- e.g. URL as unique identifier
id INTEGER PRIMARY KEY AUTOINCREMENT,
title TEXT NOT NULL,
summary_de TEXT,
summary_en TEXT,
published INTEGER, -- Unix epoch (UTC); use TEXT ISO-8601 if you prefer
source TEXT,
country TEXT,
source_feed TEXT
description TEXT,
url TEXT NOT NULL,
published TEXT NOT NULL,
country TEXT NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""",
"CREATE INDEX IF NOT EXISTS idx_news_published ON news(published)",
"""
CREATE TABLE IF NOT EXISTS feeds (
id INTEGER PRIMARY KEY, -- auto-increment via rowid
id INTEGER PRIMARY KEY,
country TEXT,
url TEXT UNIQUE NOT NULL
)
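The hunk above interleaves the old and new column lists for the `news` table. Read together with the INSERT and SELECT statements later in this diff, the new shape appears to be the one sketched below; treat it as a reading of the diff, not an authoritative schema dump.

```python
# Reading of the reworked `news` table (the old summary_de/summary_en/source/source_feed
# columns drop out); column names match the INSERT/SELECT statements later in this diff,
# the exact types are inferred from the hunk above.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("""
    CREATE TABLE IF NOT EXISTS news (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT NOT NULL,
        description TEXT,
        url TEXT NOT NULL,
        published TEXT NOT NULL,
        country TEXT NOT NULL,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
""")
conn.execute(
    "INSERT INTO news (title, description, url, published, country) VALUES (?, ?, ?, ?, ?)",
    ("Example headline", "Example teaser", "https://example.org/article", "1700000000", "de"),
)
print(conn.execute("SELECT id, title, created_at FROM news").fetchone())
conn.close()
```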
@@ -94,23 +102,23 @@ class DatabaseManager:
db_path: Path to the SQLite database file
"""
self.db_path = db_path
self._connection = None
self._initialize_db()

def _get_connection(self) -> sqlite3.Connection:
"""
Get or create a database connection.
Create a thread-safe database connection.

Returns:
An active SQLite connection
"""
if self._connection is None:
self._connection = sqlite3.connect(
self.db_path,
check_same_thread=False
)
self._connection.row_factory = sqlite3.Row
return self._connection
conn = sqlite3.connect(
self.db_path,
check_same_thread=False, # Allow use across threads
timeout=20.0 # Add timeout to prevent deadlocks
)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
return conn
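The replacement connection helper switches the database to WAL journaling and adds a busy timeout so the scheduler and API handlers are less likely to block each other. A small self-contained check of that pragma, using a temporary file because purely in-memory databases cannot switch to WAL:

```python
# Standalone check of the connection settings used in _get_connection above.
import sqlite3
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    conn = sqlite3.connect(Path(tmp) / "demo.sqlite", check_same_thread=False, timeout=20.0)
    conn.row_factory = sqlite3.Row
    mode = conn.execute("PRAGMA journal_mode=WAL").fetchone()[0]
    print(mode)  # prints "wal" for file-backed databases
    conn.close()
```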

@contextmanager
def get_cursor(self) -> Iterator[sqlite3.Cursor]:
@@ -119,70 +127,138 @@ class DatabaseManager:

Yields:
A database cursor for executing SQL statements

Example:
```python
with db_manager.get_cursor() as cursor:
cursor.execute("SELECT * FROM table")
results = cursor.fetchall()
```
"""
conn = self._get_connection()
cursor = conn.cursor()
conn = None
try:
conn = self._get_connection()
cursor = conn.cursor()
yield cursor
conn.commit()
except Exception:
conn.rollback()
raise
except Exception as e:
if conn:
conn.rollback()
raise e
finally:
if conn:
conn.close()
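As rewritten, get_cursor opens a connection per use, commits on success, rolls back on any exception, and always closes the connection. A stripped-down sketch of that pattern outside the class, against an in-memory database:

```python
# Minimal sketch of the commit/rollback/close discipline behind get_cursor.
import sqlite3
from contextlib import contextmanager
from typing import Iterator

@contextmanager
def cursor_for(db_path: str) -> Iterator[sqlite3.Cursor]:
    conn = sqlite3.connect(db_path, check_same_thread=False, timeout=20.0)
    conn.row_factory = sqlite3.Row
    try:
        yield conn.cursor()
        conn.commit()           # only reached when the with-block raised nothing
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()

with cursor_for(":memory:") as cur:
    cur.execute("CREATE TABLE t (x INTEGER)")
    cur.execute("INSERT INTO t VALUES (1)")
```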

def _initialize_db(self) -> None:
"""
Initialize the database schema and default settings.
Creates tables if they don't exist and inserts default values.
"""
logger.info("🗄️ Initializing database...")

# Create schema
with self.get_cursor() as cursor:
for stmt in SCHEMA_SQL:
for i, stmt in enumerate(SCHEMA_SQL):
logger.debug(f"📝 Executing schema statement {i+1}/{len(SCHEMA_SQL)}")
cursor.execute(stmt)

# Add migration for description column if it doesn't exist
try:
cursor.execute("SELECT description FROM news LIMIT 1")
logger.debug("✅ Description column exists")
except sqlite3.OperationalError:
# Column doesn't exist, add it
logger.info("🔧 Adding missing description column to news table...")
cursor.execute("ALTER TABLE news ADD COLUMN description TEXT")

# Insert initial settings
cursor.execute(
"INSERT INTO settings VALUES (?, ?) ON CONFLICT (key) DO NOTHING",
("cron_hours", str(CRON_HOURS))
)
logger.debug("⚙️ Settings initialized")

# Insert initial metadata
cursor.execute(
"INSERT INTO meta VALUES (?, ?) ON CONFLICT (key) DO NOTHING",
("last_sync", "0")
)
logger.debug("📊 Metadata initialized")

# Check current feed count
cursor.execute("SELECT COUNT(*) as count FROM feeds")
feed_count = cursor.fetchone()["count"]
logger.info(f"📡 Current feeds in database: {feed_count}")

# Seed feeds if none exist
cursor.execute("SELECT COUNT(*) as count FROM feeds")
if cursor.fetchone()["count"] == 0:
self._seed_feeds()
if feed_count == 0:
logger.info("🌱 No feeds found, starting seeding process...")
feeds_added = self._seed_feeds(cursor) # Pass the existing cursor

def _seed_feeds(self) -> None:
# Verify seeding worked
cursor.execute("SELECT COUNT(*) as count FROM feeds")
new_feed_count = cursor.fetchone()["count"]
logger.info(f"📡 Feeds after seeding: {new_feed_count}")
else:
logger.info("📡 Feeds already exist, skipping seeding")

logger.info("✅ Database initialization complete")

def _seed_feeds(self, cursor: sqlite3.Cursor) -> int:
"""
Seed the database with initial feeds from the seed_feeds.json file.
Only runs if the feeds table is empty.

Args:
cursor: Database cursor to use for operations

Returns:
Number of feeds added
"""
logger.info("🌱 Seeding feeds from seed_feeds.json...")
feeds_added = 0

try:
seed_path = Path(__file__).with_name("seed_feeds.json")
logger.debug(f"📁 Looking for seed file at: {seed_path}")

if not seed_path.exists():
logger.error(f"❌ Seed file not found at: {seed_path}")
return feeds_added

with open(seed_path, "r") as f:
seed_data = json.load(f)

with self.get_cursor() as cursor:
for country, urls in seed_data.items():
for url in urls:
logger.debug(f"📄 Loaded seed data: {seed_data}")

for country, urls in seed_data.items():
logger.info(f"🌍 Processing {len(urls)} feeds for country: {country}")
for url in urls:
try:
cursor.execute(
"INSERT INTO feeds (country, url) VALUES (?, ?) "
"ON CONFLICT (url) DO NOTHING",
(country, url)
)
except (FileNotFoundError, json.JSONDecodeError) as e:
print(f"Error seeding feeds: {e}")
# Check if the insert actually added a row
if cursor.rowcount > 0:
feeds_added += 1
logger.debug(f"✅ Added feed: {url} ({country})")
else:
logger.debug(f"⏩ Feed already exists: {url} ({country})")
except Exception as e:
logger.error(f"❌ Failed to add feed {url}: {e}")

logger.info(f"🌱 Seeding complete: {feeds_added} feeds added")

except json.JSONDecodeError as e:
logger.error(f"❌ Invalid JSON in seed_feeds.json: {e}")
# Re-read file content for error reporting
try:
with open(seed_path, "r") as f:
content = f.read()
logger.error(f"📄 File content causing error: {content}")
except:
logger.error("📄 Could not re-read file for error reporting")
except FileNotFoundError as e:
logger.error(f"❌ Seed file not found: {e}")
except Exception as e:
logger.error(f"❌ Error seeding feeds: {e}")

return feeds_added
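_seed_feeds iterates `seed_data.items()` and then each URL, so seed_feeds.json is expected to map a country code to a list of feed URLs. A minimal sketch of that shape and the rowcount-based counting, with placeholder URLs and an in-memory database:

```python
# Sketch of the expected seed_feeds.json shape and the ON CONFLICT counting loop.
import sqlite3

seed_data = {                       # placeholder URLs, not the shipped seed file
    "de": ["https://example.org/de/rss"],
    "us": ["https://example.org/us/rss", "https://example.org/us/rss"],  # duplicate on purpose
}

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE feeds (id INTEGER PRIMARY KEY, country TEXT, url TEXT UNIQUE NOT NULL)")
cursor = conn.cursor()
feeds_added = 0
for country, urls in seed_data.items():
    for url in urls:
        cursor.execute(
            "INSERT INTO feeds (country, url) VALUES (?, ?) ON CONFLICT (url) DO NOTHING",
            (country, url),
        )
        if cursor.rowcount > 0:     # 0 when the URL already existed
            feeds_added += 1
print(f"{feeds_added} feeds added")  # 2: the duplicate URL is skipped
conn.commit()
conn.close()
```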


# Initialize database manager
@@ -203,41 +279,51 @@ class NewsFetcher:
"""

@staticmethod
def build_prompt(url: str) -> str:
def build_prompt(url: str, title: str = "", description: str = "") -> str:
"""
Generate a prompt for the LLM to summarize an article.

Args:
url: Public URL of the article to summarize
title: Article title from RSS feed (optional)
description: Article description from RSS feed (optional)

Returns:
A formatted prompt string that instructs the LLM to generate
a JSON response with title and summaries in German and English

Note:
LLMs like qwen2 don't have native web access; the model will
generate summaries based on its training data and the URL.
"""
context_info = []
if title:
context_info.append(f"Titel: {title}")
if description:
context_info.append(f"Beschreibung: {description}")

context = "\n".join(context_info) if context_info else "Keine zusätzlichen Informationen verfügbar."

return (
"### Aufgabe\n"
f"Du bekommst eine öffentliche URL: {url}\n"
f"Du sollst eine Nachricht basierend auf der URL und den verfügbaren Informationen zusammenfassen.\n"
f"URL: {url}\n"
f"Verfügbare Informationen:\n{context}\n\n"
"### Regeln\n"
"1. **Entnimm den Inhalt nicht automatisch.** "
"Falls dir der Text nicht vorliegt, antworte mit leeren Strings.\n"
"2. Gib ausschließlich **gültiges minifiziertes JSON** zurück – "
"kein Markdown, keine Kommentare.\n"
"3. Struktur:\n"
"{\"title\":\"…\",\"summary_de\":\"…\",\"summary_en\":\"…\"}\n"
"4. summary_de ≤ 160 Wörter, summary_en ≤ 160 Wörter. Zähle selbst.\n"
"5. Kein Text vor oder nach dem JSON.\n"
"1. Nutze die verfügbaren Informationen (Titel, Beschreibung) und dein Wissen über die URL-Domain\n"
"2. Falls keine ausreichenden Informationen vorliegen, erstelle eine plausible Zusammenfassung basierend auf der URL\n"
"3. Gib ausschließlich **gültiges minifiziertes JSON** zurück – kein Markdown, keine Kommentare\n"
"4. Struktur: {\"title\":\"…\",\"summary_de\":\"…\",\"summary_en\":\"…\"}\n"
"5. title: Aussagekräftiger deutscher Titel (max 100 Zeichen)\n"
"6. summary_de: Deutsche Zusammenfassung (max 160 Wörter)\n"
"7. summary_en: Englische Zusammenfassung (max 160 Wörter)\n"
"8. Kein Text vor oder nach dem JSON\n\n"
"### Ausgabe\n"
"Jetzt antworte."
"Jetzt antworte mit dem JSON:"
)

@staticmethod
async def summarize_article(
client: httpx.AsyncClient,
url: str
url: str,
title: str = "",
description: str = ""
) -> Optional[ArticleSummary]:
"""
Generate a summary of an article using the LLM.
@@ -245,31 +331,85 @@ class NewsFetcher:
Args:
client: An active httpx AsyncClient for making requests
url: URL of the article to summarize
title: Article title from RSS feed
description: Article description from RSS feed

Returns:
A dictionary containing the article title and summaries in German and English,
or None if summarization failed
"""
prompt = NewsFetcher.build_prompt(url)
logger.info(f"🤖 Starting article summarization for: {url}")
logger.debug(f"📝 RSS Title: {title[:50]}..." if title else "📝 No RSS title")
logger.debug(f"📄 RSS Description: {description[:100]}..." if description else "📄 No RSS description")

prompt = NewsFetcher.build_prompt(url, title, description)
payload = {
"model": LLM_MODEL,
"prompt": prompt,
"stream": False,
"temperature": 0.2,
"temperature": 0.3, # Slightly increase creativity
"format": "json"
}

try:
logger.debug(f"📤 Sending request to Ollama API with model: {LLM_MODEL}")
start_time = datetime.now()

response = await client.post(
f"{OLLAMA_HOST}/api/generate",
json=payload,
timeout=LLM_TIMEOUT_SECONDS
)

elapsed_time = (datetime.now() - start_time).total_seconds()
logger.info(f"⏱️ Ollama API response received in {elapsed_time:.2f}s")

response.raise_for_status()
result = response.json()
return cast(ArticleSummary, result["response"])
except (KeyError, ValueError, httpx.HTTPError, json.JSONDecodeError) as e:
print(f"Error summarizing article {url}: {e}")

logger.debug(f"📥 Raw Ollama response keys: {list(result.keys())}")

# Parse the JSON string returned by the LLM
llm_response = result["response"]
logger.debug(f"🔍 LLM response type: {type(llm_response)}")
logger.debug(f"🔍 LLM response preview: {str(llm_response)[:200]}...")

if isinstance(llm_response, str):
logger.debug("📋 Parsing JSON string response")
summary_data = json.loads(llm_response)
else:
logger.debug("📋 Using direct dict response")
summary_data = llm_response

# Validate required fields
required_fields = ["title", "summary_de", "summary_en"]
missing_fields = [field for field in required_fields if field not in summary_data]

if missing_fields:
logger.warning(f"⚠️ Missing required fields in summary: {missing_fields}")
return None

# Log summary quality metrics
title_len = len(summary_data.get("title", ""))
de_words = len(summary_data.get("summary_de", "").split())
en_words = len(summary_data.get("summary_en", "").split())

logger.info(f"✅ Summary generated - Title: {title_len} chars, DE: {de_words} words, EN: {en_words} words")

if de_words > 160 or en_words > 160:
logger.warning(f"⚠️ Summary exceeds word limit - DE: {de_words}/160, EN: {en_words}/160")

return cast(ArticleSummary, summary_data)

except json.JSONDecodeError as e:
logger.error(f"❌ JSON parsing error for {url}: {e}")
logger.error(f"🔍 Raw response that failed to parse: {llm_response[:500]}...")
return None
except httpx.HTTPError as e:
logger.error(f"❌ HTTP error for {url}: {e}")
return None
except Exception as e:
logger.error(f"❌ Unexpected error summarizing {url}: {type(e).__name__}: {e}")
return None
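For reference, a trimmed standalone version of the Ollama call that summarize_article builds: with `"format": "json"` and `"stream": False`, `/api/generate` returns one JSON document whose `response` field is itself a JSON string that still has to be parsed, which is exactly the str-vs-dict branch above. Model name and host mirror the constants at the top of this diff; running it requires a local Ollama server with that model pulled.

```python
# Standalone sketch of the /api/generate call made in summarize_article.
import asyncio
import json
import httpx

async def ollama_json(prompt: str) -> dict:
    payload = {
        "model": "qwen2:7b-instruct-q4_K_M",   # LLM_MODEL above
        "prompt": prompt,
        "stream": False,
        "format": "json",
    }
    async with httpx.AsyncClient() as client:
        resp = await client.post("http://localhost:11434/api/generate",
                                 json=payload, timeout=180)
        resp.raise_for_status()
        raw = resp.json()["response"]          # JSON delivered as a string
        return raw if isinstance(raw, dict) else json.loads(raw)

# Uncomment with a running Ollama instance:
# print(asyncio.run(ollama_json('Antworte nur mit {"title":"","summary_de":"","summary_en":""}')))
```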

@staticmethod
@@ -278,16 +418,35 @@ class NewsFetcher:
Fetch articles from all feeds and store summaries in the database.
This is the main function that runs periodically to update the news database.
"""
logger.info("🚀 Starting scheduled news harvest...")
harvest_start_time = datetime.now()

total_feeds = 0
total_articles = 0
successful_articles = 0
failed_articles = 0

try:
# Get all feeds from the database
with db_manager.get_cursor() as cursor:
cursor.execute("SELECT country, url FROM feeds")
feeds = cursor.fetchall()
total_feeds = len(feeds)

logger.info(f"📡 Found {total_feeds} feeds to process")

# Process each feed
async with httpx.AsyncClient() as client:
for feed_row in feeds:
await NewsFetcher._process_feed(client, feed_row)
for i, feed_row in enumerate(feeds, 1):
logger.info(f"📰 Processing feed {i}/{total_feeds}: {feed_row['url']} ({feed_row['country']})")

feed_stats = await NewsFetcher._process_feed(client, feed_row)

total_articles += feed_stats['total']
successful_articles += feed_stats['successful']
failed_articles += feed_stats['failed']

logger.info(f"📊 Feed {i} complete: {feed_stats['successful']}/{feed_stats['total']} articles processed successfully")

# Update last sync timestamp
current_time = int(datetime.now(timezone.utc).timestamp())
@@ -296,30 +455,66 @@ class NewsFetcher:
"UPDATE meta SET val=? WHERE key='last_sync'",
(str(current_time),)
)

harvest_duration = (datetime.now() - harvest_start_time).total_seconds()

logger.info(f"✅ News harvest completed in {harvest_duration:.2f}s")
logger.info(f"📊 Final stats: {total_feeds} feeds, {successful_articles}/{total_articles} articles processed successfully")

except Exception as e:
print(f"Error harvesting feeds: {e}")
logger.error(f"❌ Critical error during harvest: {type(e).__name__}: {e}")
raise

@staticmethod
async def _process_feed(
client: httpx.AsyncClient,
feed_row: sqlite3.Row
) -> None:
) -> Dict[str, int]:
"""
Process a single feed, fetching and summarizing all articles.

Args:
client: An active httpx AsyncClient for making requests
feed_row: A database row containing feed information

Returns:
Dictionary with processing statistics
"""
stats = {'total': 0, 'successful': 0, 'failed': 0, 'skipped': 0}

try:
logger.debug(f"🔍 Parsing RSS feed: {feed_row['url']}")
feed_data = feedparser.parse(feed_row["url"])

for entry in feed_data.entries:
if hasattr(feed_data, 'bozo') and feed_data.bozo:
logger.warning(f"⚠️ Feed has parsing issues: {feed_row['url']}")
if hasattr(feed_data, 'bozo_exception'):
logger.warning(f"⚠️ Feed exception: {feed_data.bozo_exception}")

total_entries = len(feed_data.entries)
logger.info(f"📄 Found {total_entries} entries in feed")

if total_entries == 0:
logger.warning(f"⚠️ No entries found in feed: {feed_row['url']}")
return stats

for i, entry in enumerate(feed_data.entries, 1):
stats['total'] += 1
logger.debug(f"📝 Processing article {i}/{total_entries}")

# Skip entries without links or published dates
if not hasattr(entry, "link") or not hasattr(entry, "published_parsed"):
if not hasattr(entry, "link"):
logger.debug(f"⏩ Skipping entry {i}: no link")
stats['skipped'] += 1
continue

article_id = entry.link
if not hasattr(entry, "published_parsed"):
logger.debug(f"⏩ Skipping entry {i}: no published date") # TODO: change back to 0.5
stats['skipped'] += 1
continue

article_url = entry.link
logger.debug(f"🔗 Processing article: {article_url}")

# Parse the published date
try:
@@ -327,39 +522,80 @@ class NewsFetcher:
*entry.published_parsed[:6],
tzinfo=timezone.utc
)
except (TypeError, ValueError):
# Skip entries with invalid dates
logger.debug(f"📅 Article published: {published}")
except (TypeError, ValueError) as e:
logger.debug(f"⏩ Skipping entry {i}: invalid date - {e}")
stats['skipped'] += 1
continue

# Check if article already exists
with db_manager.get_cursor() as cursor:
cursor.execute("SELECT id FROM news WHERE url = ?", (article_url,))
if cursor.fetchone():
logger.debug(f"⏩ Skipping entry {i}: article already exists")
stats['skipped'] += 1
continue

# Get article summary
summary = await NewsFetcher.summarize_article(client, entry.link)
logger.debug(f"🤖 Requesting summary for article {i}")

# Extract title and description from RSS entry
rss_title = getattr(entry, 'title', '')
rss_description = getattr(entry, 'description', '') or getattr(entry, 'summary', '')

summary = await NewsFetcher.summarize_article(
client,
article_url,
title=rss_title,
description=rss_description
)

if not summary:
logger.warning(f"❌ Failed to get summary for article {i}: {article_url}")
stats['failed'] += 1
continue

published_timestamp = int(published.timestamp())

# Handle source field - it can be a string or dict
source_value = entry.get("source", feed_row["url"])
if isinstance(source_value, dict):
source_title = source_value.get("title", feed_row["url"])
else:
source_title = source_value if source_value else feed_row["url"]

logger.debug(f"💾 Storing article in database")

# Store in database
with db_manager.get_cursor() as cursor:
cursor.execute(
"""
INSERT INTO news (
id, title, summary_de, summary_en, published,
source, country, source_feed
try:
with db_manager.get_cursor() as cursor:
cursor.execute(
"""
INSERT INTO news (title, description, url, published, country)
VALUES (?, ?, ?, ?, ?)
""",
(
summary["title"],
summary["summary_de"],
article_url,
published_timestamp,
feed_row["country"],
)
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT (id) DO NOTHING
""",
(
article_id,
summary["title"],
summary["summary_de"],
summary["summary_en"],
published.isoformat(),
entry.get("source", {}).get("title", feed_row["url"]),
feed_row["country"],
feed_row["url"],
)
)

logger.info(f"✅ Successfully processed article {i}: {summary['title'][:50]}...")
stats['successful'] += 1

except Exception as db_error:
logger.error(f"❌ Database error for article {i}: {db_error}")
stats['failed'] += 1
continue

except Exception as e:
print(f"Error processing feed {feed_row['url']}: {e}")
logger.error(f"❌ Error processing feed {feed_row['url']}: {type(e).__name__}: {e}")

logger.info(f"📊 Feed processing complete: {stats['successful']} successful, {stats['failed']} failed, {stats['skipped']} skipped out of {stats['total']} total")
return stats
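Most of the skip logic above leans on feedparser conventions: the bozo flag for malformed feeds, published_parsed as a UTC struct_time, and a source field that may be a plain string or a dict. A compact sketch of those idioms, runnable against any RSS URL:

```python
# Sketch of the feedparser idioms _process_feed relies on.
from datetime import datetime, timezone
import feedparser

def inspect_feed(url: str) -> None:
    feed = feedparser.parse(url)
    if feed.bozo:                      # parser hit a problem, entries may still be usable
        print(f"parse warning: {feed.get('bozo_exception')}")
    for entry in feed.entries:
        if not hasattr(entry, "link") or not hasattr(entry, "published_parsed"):
            continue                   # same skip rules as above
        published = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc)
        source = entry.get("source", url)
        source_title = source.get("title", url) if isinstance(source, dict) else source
        print(entry.link, published.isoformat(), source_title)

# inspect_feed("https://example.org/feed.xml")  # placeholder URL
```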


# Initialize scheduler
@@ -370,7 +606,11 @@ scheduler.add_job(
hours=CRON_HOURS,
id="harvest"
)
print(f"Starting scheduler with {CRON_HOURS} hours interval")
scheduler.start()
print("Scheduler started")
print(f"Next run: {scheduler.get_job('harvest').next_run_time}")



# Pydantic models for API requests and responses
@@ -414,7 +654,7 @@ class HoursResponse(BaseModel):


# Dependency for getting a database cursor
def get_db():
async def get_db():
"""
Dependency that provides a database cursor.

@@ -445,14 +685,20 @@ async def get_news(
Returns:
List of news articles matching the criteria
"""
db.execute(
"""
SELECT * FROM news
WHERE country=? AND published BETWEEN ? AND ?
ORDER BY published DESC
""",
(country, from_, to)
)
try:
datetime.fromisoformat(from_)
datetime.fromisoformat(to)
except ValueError:
raise HTTPException(400, "Invalid date format")
finally:
db.execute(
"""
SELECT id, title, description, url, published, country, created_at FROM news
WHERE country=? AND published BETWEEN ? AND ?
ORDER BY published DESC
""",
(country, from_, to)
)
return [dict(row) for row in db.fetchall()]
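get_news now validates the range parameters before touching the database. One note for the reader: the only HTTPException import visible in this diff comes from http.client, which FastAPI would treat as an ordinary exception (a 500), so the handler presumably intends fastapi.HTTPException. A standalone sketch of the validation step under that assumption:

```python
# Sketch of the ISO-8601 validation used in get_news, assuming FastAPI's HTTPException.
from datetime import datetime
from fastapi import HTTPException

def validate_iso_range(from_: str, to: str) -> None:
    try:
        datetime.fromisoformat(from_)
        datetime.fromisoformat(to)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid date format")

validate_iso_range("2024-01-01T00:00:00", "2024-01-02T00:00:00")  # passes silently
```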


@@ -622,9 +868,11 @@ async def manual_sync(db: sqlite3.Cursor = Depends(get_db)):
)

# Trigger sync in background
asyncio.create_task(NewsFetcher.harvest_feeds())
return {"status": "triggered"}

try:
task = asyncio.create_task(NewsFetcher.harvest_feeds())
return {"status": "triggered", "task_id": id(task)}
except Exception as e:
raise HTTPException(500, f"Failed to trigger sync: {str(e)}")
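asyncio.create_task returns immediately, so the endpoint responds while the harvest keeps running; the returned id(task) is only an opaque marker. A hedged sketch of the same fire-and-forget pattern in plain asyncio, keeping a reference so the pending task cannot be garbage-collected:

```python
# Minimal fire-and-forget sketch, independent of FastAPI; the set holds strong
# references so pending tasks are not garbage-collected mid-flight.
import asyncio

background_tasks: set[asyncio.Task] = set()

async def long_job() -> None:
    await asyncio.sleep(0.1)
    print("harvest finished")

async def trigger() -> dict:
    task = asyncio.create_task(long_job())
    background_tasks.add(task)
    task.add_done_callback(background_tasks.discard)
    return {"status": "triggered", "task_id": id(task)}

async def main() -> None:
    print(await trigger())
    await asyncio.gather(*background_tasks)   # only so the demo waits before exiting

asyncio.run(main())
```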

# Mount static frontend
frontend_path = os.path.join(