@@ -14,8 +14,10 @@ import os
import sqlite3
from contextlib import contextmanager
from datetime import datetime, timezone, timedelta
from fastapi import HTTPException
from pathlib import Path
from typing import Dict, List, Optional, Any, Union, Iterator, Tuple, TypedDict, cast
import logging

import feedparser
import httpx
@@ -27,7 +29,7 @@ from pydantic import BaseModel
# Constants
DB_PATH = Path("owlynews.sqlite")
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
MIN_CRON_HOURS = 0.1  # TODO: change back to 0.5
DEFAULT_CRON_HOURS = float(os.getenv("CRON_HOURS", MIN_CRON_HOURS))
CRON_HOURS = max(MIN_CRON_HOURS, DEFAULT_CRON_HOURS)
SYNC_COOLDOWN_MINUTES = 30
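# Worked example of the clamp above (values illustrative): CRON_HOURS=2 in the
# environment yields CRON_HOURS == 2.0, while CRON_HOURS=0.05 is floored by
# max() back up to MIN_CRON_HOURS.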
@@ -35,6 +37,13 @@ LLM_MODEL = "qwen2:7b-instruct-q4_K_M"
LLM_TIMEOUT_SECONDS = 180
OLLAMA_API_TIMEOUT_SECONDS = 10

# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# FastAPI app initialization
app = FastAPI(
    title="Owly News Summariser",
@@ -46,20 +55,19 @@ app = FastAPI(
SCHEMA_SQL = [
    """
    CREATE TABLE IF NOT EXISTS news (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT NOT NULL,
        description TEXT,
        url TEXT NOT NULL,
        published TEXT NOT NULL,
        country TEXT NOT NULL,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
    """,
    "CREATE INDEX IF NOT EXISTS idx_news_published ON news(published)",
    """
    CREATE TABLE IF NOT EXISTS feeds (
        id INTEGER PRIMARY KEY,
        country TEXT,
        url TEXT UNIQUE NOT NULL
    )
@@ -94,23 +102,23 @@ class DatabaseManager:
            db_path: Path to the SQLite database file
        """
        self.db_path = db_path
        self._initialize_db()

    def _get_connection(self) -> sqlite3.Connection:
        """
        Create a thread-safe database connection.

        Returns:
            An active SQLite connection
        """
        conn = sqlite3.connect(
            self.db_path,
            check_same_thread=False,  # Allow use across threads
            timeout=20.0  # Add timeout to prevent deadlocks
        )
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA journal_mode=WAL")
        return conn

    @contextmanager
    def get_cursor(self) -> Iterator[sqlite3.Cursor]:
@@ -119,70 +127,138 @@ class DatabaseManager:
        Yields:
            A database cursor for executing SQL statements

        Example:
            ```python
            with db_manager.get_cursor() as cursor:
                cursor.execute("SELECT * FROM table")
                results = cursor.fetchall()
            ```
        """
        conn = None
        try:
            conn = self._get_connection()
            cursor = conn.cursor()
            yield cursor
            conn.commit()
        except Exception as e:
            if conn:
                conn.rollback()
            raise e
        finally:
            if conn:
                conn.close()

    def _initialize_db(self) -> None:
        """
        Initialize the database schema and default settings.

        Creates tables if they don't exist and inserts default values.
        """
        logger.info("🗄️ Initializing database...")

        # Create schema
        with self.get_cursor() as cursor:
            for i, stmt in enumerate(SCHEMA_SQL):
                logger.debug(f"📝 Executing schema statement {i + 1}/{len(SCHEMA_SQL)}")
                cursor.execute(stmt)

            # Add migration for description column if it doesn't exist
            try:
                cursor.execute("SELECT description FROM news LIMIT 1")
                logger.debug("✅ Description column exists")
            except sqlite3.OperationalError:
                # Column doesn't exist, add it
                logger.info("🔧 Adding missing description column to news table...")
                cursor.execute("ALTER TABLE news ADD COLUMN description TEXT")

            # Insert initial settings
            cursor.execute(
                "INSERT INTO settings VALUES (?, ?) ON CONFLICT (key) DO NOTHING",
                ("cron_hours", str(CRON_HOURS))
            )
            logger.debug("⚙️ Settings initialized")

            # Insert initial metadata
            cursor.execute(
                "INSERT INTO meta VALUES (?, ?) ON CONFLICT (key) DO NOTHING",
                ("last_sync", "0")
            )
            logger.debug("📊 Metadata initialized")

            # Check current feed count
            cursor.execute("SELECT COUNT(*) as count FROM feeds")
            feed_count = cursor.fetchone()["count"]
            logger.info(f"📡 Current feeds in database: {feed_count}")

            # Seed feeds if none exist
            if feed_count == 0:
                logger.info("🌱 No feeds found, starting seeding process...")
                feeds_added = self._seed_feeds(cursor)  # Pass the existing cursor

                # Verify seeding worked
                cursor.execute("SELECT COUNT(*) as count FROM feeds")
                new_feed_count = cursor.fetchone()["count"]
                logger.info(f"📡 Feeds after seeding: {new_feed_count} ({feeds_added} newly added)")
            else:
                logger.info("📡 Feeds already exist, skipping seeding")

        logger.info("✅ Database initialization complete")

    def _seed_feeds(self, cursor: sqlite3.Cursor) -> int:
        """
        Seed the database with initial feeds from the seed_feeds.json file.

        Only runs if the feeds table is empty.

        Args:
            cursor: Database cursor to use for operations

        Returns:
            Number of feeds added
        """
        logger.info("🌱 Seeding feeds from seed_feeds.json...")
        feeds_added = 0

        try:
            seed_path = Path(__file__).with_name("seed_feeds.json")
            logger.debug(f"📁 Looking for seed file at: {seed_path}")

            if not seed_path.exists():
                logger.error(f"❌ Seed file not found at: {seed_path}")
                return feeds_added

            with open(seed_path, "r") as f:
                seed_data = json.load(f)

            logger.debug(f"📄 Loaded seed data: {seed_data}")

            for country, urls in seed_data.items():
                logger.info(f"🌍 Processing {len(urls)} feeds for country: {country}")
                for url in urls:
                    try:
                        cursor.execute(
                            "INSERT INTO feeds (country, url) VALUES (?, ?) "
                            "ON CONFLICT (url) DO NOTHING",
                            (country, url)
                        )

                        # Check if the insert actually added a row
                        if cursor.rowcount > 0:
                            feeds_added += 1
                            logger.debug(f"✅ Added feed: {url} ({country})")
                        else:
                            logger.debug(f"⏩ Feed already exists: {url} ({country})")
                    except Exception as e:
                        logger.error(f"❌ Failed to add feed {url}: {e}")

            logger.info(f"🌱 Seeding complete: {feeds_added} feeds added")

        except json.JSONDecodeError as e:
            logger.error(f"❌ Invalid JSON in seed_feeds.json: {e}")
            # Re-read file content for error reporting
            try:
                with open(seed_path, "r") as f:
                    content = f.read()
                logger.error(f"📄 File content causing error: {content}")
            except Exception:
                logger.error("📄 Could not re-read file for error reporting")
        except FileNotFoundError as e:
            logger.error(f"❌ Seed file not found: {e}")
        except Exception as e:
            logger.error(f"❌ Error seeding feeds: {e}")

        return feeds_added
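    # A minimal sketch of the seed_feeds.json layout _seed_feeds() assumes
    # (country code -> list of feed URLs); the entries below are illustrative
    # placeholders, not the project's real seed data:
    #
    #   {
    #       "de": ["https://example.org/de/rss.xml"],
    #       "us": ["https://example.org/us/rss.xml"]
    #   }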
# Initialize database manager
@@ -203,41 +279,51 @@ class NewsFetcher:
"""
"""
@staticmethod
@staticmethod
def build_prompt ( url : str ) - > str :
def build_prompt ( url : str , title : str = " " , description : str = " " ) - > str :
"""
"""
Generate a prompt for the LLM to summarize an article.
Generate a prompt for the LLM to summarize an article.
Args:
Args:
url: Public URL of the article to summarize
url: Public URL of the article to summarize
title: Article title from RSS feed (optional)
description: Article description from RSS feed (optional)
Returns:
Returns:
A formatted prompt string that instructs the LLM to generate
A formatted prompt string that instructs the LLM to generate
a JSON response with title and summaries in German and English
a JSON response with title and summaries in German and English
Note:
LLMs like qwen2 don ' t have native web access; the model will
generate summaries based on its training data and the URL.
"""
"""
context_info = [ ]
if title :
context_info . append ( f " Titel: { title } " )
if description :
context_info . append ( f " Beschreibung: { description } " )
context = " \n " . join ( context_info ) if context_info else " Keine zusätzlichen Informationen verfügbar. "
return (
return (
" ### Aufgabe \n "
" ### Aufgabe \n "
f " Du bekomm st eine öffentliche URL: { url } \n "
f " Du soll st eine Nachricht basierend auf der URL und den verfügbaren Informationen zusammenfassen. \n "
f " URL: { url } \n "
f " Verfügbare Informationen: \n { context } \n \n "
" ### Regeln \n "
" ### Regeln \n "
" 1. **Entnimm den Inhalt nicht automatisch.** "
" 1. Nutze die verfügbaren Informationen (Titel, Beschreibung) und dein Wissen über die URL-Domain \n "
" Falls dir der Text nicht vorliegt, antworte mit leeren Strings. \n "
" 2. Falls keine ausreichenden Informationen vorliegen, erstelle eine plausible Zusammenfassung basierend auf der URL \n "
" 2 . Gib ausschließlich **gültiges minifiziertes JSON** zurück – "
" 3 . Gib ausschließlich **gültiges minifiziertes JSON** zurück – kein Markdown, keine Kommentare \n "
" kein Markdown, keine Kommentare. \n "
" 4. Struktur: { \" title \" : \" … \" , \" summary_de \" : \" … \" , \" summary_en \" : \" … \" } \n "
" 3 . Struktur: \n "
" 5 . title: Aussagekräftiger deutscher Titel (max 100 Zeichen) \n "
" { \" title \" : \" … \" , \" summary_de \" : \" … \" , \" summary_en \" : \" … \" } \n "
" 6. summary_de: Deutsche Zusammenfassung (max 160 Wörter) \n "
" 4 . summary_de ≤ 160 Wörter, summary_en ≤ 160 Wörter. Zähle selbst. \n "
" 7 . summary_en: Englische Zusammenfassung (max 160 Wörter) \n "
" 5 . Kein Text vor oder nach dem JSON. \n "
" 8 . Kein Text vor oder nach dem JSON\n \n "
" ### Ausgabe \n "
" ### Ausgabe \n "
" Jetzt antworte. "
" Jetzt antworte mit dem JSON: "
)
)
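    # Illustrative shape of the reply the prompt above requests from the model
    # (the actual wording depends on the article and on the model's output):
    #   {"title": "…", "summary_de": "…", "summary_en": "…"}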
    @staticmethod
    async def summarize_article(
        client: httpx.AsyncClient,
        url: str,
        title: str = "",
        description: str = ""
    ) -> Optional[ArticleSummary]:
        """
        Generate a summary of an article using the LLM.
@@ -245,31 +331,85 @@ class NewsFetcher:
        Args:
            client: An active httpx AsyncClient for making requests
            url: URL of the article to summarize
            title: Article title from RSS feed
            description: Article description from RSS feed

        Returns:
            A dictionary containing the article title and summaries in German and English,
            or None if summarization failed
        """
        logger.info(f"🤖 Starting article summarization for: {url}")
        logger.debug(f"📝 RSS Title: {title[:50]}..." if title else "📝 No RSS title")
        logger.debug(f"📄 RSS Description: {description[:100]}..." if description else "📄 No RSS description")

        prompt = NewsFetcher.build_prompt(url, title, description)
        payload = {
            "model": LLM_MODEL,
            "prompt": prompt,
            "stream": False,
            "temperature": 0.3,  # Slightly increase creativity
            "format": "json"
        }

        try:
            logger.debug(f"📤 Sending request to Ollama API with model: {LLM_MODEL}")
            start_time = datetime.now()

            response = await client.post(
                f"{OLLAMA_HOST}/api/generate",
                json=payload,
                timeout=LLM_TIMEOUT_SECONDS
            )

            elapsed_time = (datetime.now() - start_time).total_seconds()
            logger.info(f"⏱️ Ollama API response received in {elapsed_time:.2f}s")

            response.raise_for_status()
            result = response.json()

            logger.debug(f"📥 Raw Ollama response keys: {list(result.keys())}")

            # Parse the JSON string returned by the LLM
            llm_response = result["response"]
            logger.debug(f"🔍 LLM response type: {type(llm_response)}")
            logger.debug(f"🔍 LLM response preview: {str(llm_response)[:200]}...")

            if isinstance(llm_response, str):
                logger.debug("📋 Parsing JSON string response")
                summary_data = json.loads(llm_response)
            else:
                logger.debug("📋 Using direct dict response")
                summary_data = llm_response

            # Validate required fields
            required_fields = ["title", "summary_de", "summary_en"]
            missing_fields = [field for field in required_fields if field not in summary_data]
            if missing_fields:
                logger.warning(f"⚠️ Missing required fields in summary: {missing_fields}")
                return None

            # Log summary quality metrics
            title_len = len(summary_data.get("title", ""))
            de_words = len(summary_data.get("summary_de", "").split())
            en_words = len(summary_data.get("summary_en", "").split())
            logger.info(f"✅ Summary generated - Title: {title_len} chars, DE: {de_words} words, EN: {en_words} words")

            if de_words > 160 or en_words > 160:
                logger.warning(f"⚠️ Summary exceeds word limit - DE: {de_words}/160, EN: {en_words}/160")

            return cast(ArticleSummary, summary_data)

        except json.JSONDecodeError as e:
            logger.error(f"❌ JSON parsing error for {url}: {e}")
            logger.error(f"🔍 Raw response that failed to parse: {llm_response[:500]}...")
            return None
        except httpx.HTTPError as e:
            logger.error(f"❌ HTTP error for {url}: {e}")
            return None
        except Exception as e:
            logger.error(f"❌ Unexpected error summarizing {url}: {type(e).__name__}: {e}")
            return None

    @staticmethod
@@ -278,16 +418,35 @@ class NewsFetcher:
        Fetch articles from all feeds and store summaries in the database.

        This is the main function that runs periodically to update the news database.
        """
        logger.info("🚀 Starting scheduled news harvest...")
        harvest_start_time = datetime.now()

        total_feeds = 0
        total_articles = 0
        successful_articles = 0
        failed_articles = 0

        try:
            # Get all feeds from the database
            with db_manager.get_cursor() as cursor:
                cursor.execute("SELECT country, url FROM feeds")
                feeds = cursor.fetchall()

            total_feeds = len(feeds)
            logger.info(f"📡 Found {total_feeds} feeds to process")

            # Process each feed
            async with httpx.AsyncClient() as client:
                for i, feed_row in enumerate(feeds, 1):
                    logger.info(f"📰 Processing feed {i}/{total_feeds}: {feed_row['url']} ({feed_row['country']})")

                    feed_stats = await NewsFetcher._process_feed(client, feed_row)
                    total_articles += feed_stats['total']
                    successful_articles += feed_stats['successful']
                    failed_articles += feed_stats['failed']

                    logger.info(f"📊 Feed {i} complete: {feed_stats['successful']}/{feed_stats['total']} articles processed successfully")

            # Update last sync timestamp
            current_time = int(datetime.now(timezone.utc).timestamp())
@@ -296,30 +455,66 @@ class NewsFetcher:
" UPDATE meta SET val=? WHERE key= ' last_sync ' " ,
" UPDATE meta SET val=? WHERE key= ' last_sync ' " ,
( str ( current_time ) , )
( str ( current_time ) , )
)
)
harvest_duration = ( datetime . now ( ) - harvest_start_time ) . total_seconds ( )
logger . info ( f " ✅ News harvest completed in { harvest_duration : .2f } s " )
logger . info ( f " 📊 Final stats: { total_feeds } feeds, { successful_articles } / { total_articles } articles processed successfully " )
except Exception as e :
except Exception as e :
print ( f " Error harvesting feeds : { e } " )
logger . error ( f " ❌ Critical error during harvest: { type ( e ) . __name__ } : { e } " )
raise
@staticmethod
@staticmethod
async def _process_feed (
async def _process_feed (
client : httpx . AsyncClient ,
client : httpx . AsyncClient ,
feed_row : sqlite3 . Row
feed_row : sqlite3 . Row
) - > None :
) - > Dict [ str , int ] :
"""
"""
Process a single feed, fetching and summarizing all articles.
Process a single feed, fetching and summarizing all articles.
Args:
Args:
client: An active httpx AsyncClient for making requests
client: An active httpx AsyncClient for making requests
feed_row: A database row containing feed information
feed_row: A database row containing feed information
Returns:
Dictionary with processing statistics
"""
"""
stats = { ' total ' : 0 , ' successful ' : 0 , ' failed ' : 0 , ' skipped ' : 0 }
try :
try :
logger . debug ( f " 🔍 Parsing RSS feed: { feed_row [ ' url ' ] } " )
feed_data = feedparser . parse ( feed_row [ " url " ] )
feed_data = feedparser . parse ( feed_row [ " url " ] )
for entry in feed_data . entries :
if hasattr ( feed_data , ' bozo ' ) and feed_data . bozo :
logger . warning ( f " ⚠️ Feed has parsing issues: { feed_row [ ' url ' ] } " )
if hasattr ( feed_data , ' bozo_exception ' ) :
logger . warning ( f " ⚠️ Feed exception: { feed_data . bozo_exception } " )
total_entries = len ( feed_data . entries )
logger . info ( f " 📄 Found { total_entries } entries in feed " )
if total_entries == 0 :
logger . warning ( f " ⚠️ No entries found in feed: { feed_row [ ' url ' ] } " )
return stats
for i , entry in enumerate ( feed_data . entries , 1 ) :
stats [ ' total ' ] + = 1
logger . debug ( f " 📝 Processing article { i } / { total_entries } " )
# Skip entries without links or published dates
# Skip entries without links or published dates
if not hasattr ( entry , " link " ) or not hasattr ( entry , " published_parsed " ) :
if not hasattr ( entry , " link " ) :
logger . debug ( f " ⏩ Skipping entry { i } : no link " )
stats [ ' skipped ' ] + = 1
continue
continue
article_id = entry . link
if not hasattr ( entry , " published_parsed " ) :
logger . debug ( f " ⏩ Skipping entry { i } : no published date " ) # TODO: change back to 0.5
stats [ ' skipped ' ] + = 1
continue
article_url = entry . link
logger . debug ( f " 🔗 Processing article: { article_url } " )
# Parse the published date
# Parse the published date
try :
try :
@@ -327,39 +522,80 @@ class NewsFetcher:
                        *entry.published_parsed[:6],
                        tzinfo=timezone.utc
                    )
                    logger.debug(f"📅 Article published: {published}")
                except (TypeError, ValueError) as e:
                    # Skip entries with invalid dates
                    logger.debug(f"⏩ Skipping entry {i}: invalid date - {e}")
                    stats['skipped'] += 1
                    continue

                # Check if article already exists
                with db_manager.get_cursor() as cursor:
                    cursor.execute("SELECT id FROM news WHERE url = ?", (article_url,))
                    if cursor.fetchone():
                        logger.debug(f"⏩ Skipping entry {i}: article already exists")
                        stats['skipped'] += 1
                        continue

                # Get article summary
                logger.debug(f"🤖 Requesting summary for article {i}")

                # Extract title and description from RSS entry
                rss_title = getattr(entry, 'title', '')
                rss_description = getattr(entry, 'description', '') or getattr(entry, 'summary', '')

                summary = await NewsFetcher.summarize_article(
                    client,
                    article_url,
                    title=rss_title,
                    description=rss_description
                )

                if not summary:
                    logger.warning(f"❌ Failed to get summary for article {i}: {article_url}")
                    stats['failed'] += 1
                    continue

                published_timestamp = int(published.timestamp())

                # Handle source field - it can be a string or dict
                source_value = entry.get("source", feed_row["url"])
                if isinstance(source_value, dict):
                    source_title = source_value.get("title", feed_row["url"])
                else:
                    source_title = source_value if source_value else feed_row["url"]

                logger.debug("💾 Storing article in database")

                # Store in database
                try:
                    with db_manager.get_cursor() as cursor:
                        cursor.execute(
                            """
                            INSERT INTO news (title, description, url, published, country)
                            VALUES (?, ?, ?, ?, ?)
                            """,
                            (
                                summary["title"],
                                summary["summary_de"],
                                article_url,
                                published_timestamp,
                                feed_row["country"],
                            )
                        )

                    logger.info(f"✅ Successfully processed article {i}: {summary['title'][:50]}...")
                    stats['successful'] += 1

                except Exception as db_error:
                    logger.error(f"❌ Database error for article {i}: {db_error}")
                    stats['failed'] += 1
                    continue

        except Exception as e:
            logger.error(f"❌ Error processing feed {feed_row['url']}: {type(e).__name__}: {e}")

        logger.info(f"📊 Feed processing complete: {stats['successful']} successful, {stats['failed']} failed, {stats['skipped']} skipped out of {stats['total']} total")
        return stats
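    # Illustrative _process_feed() return value for a feed with 10 entries in
    # which 7 summaries succeed, 1 fails and 2 entries are skipped:
    #   {'total': 10, 'successful': 7, 'failed': 1, 'skipped': 2}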
# Initialize scheduler
@@ -370,7 +606,11 @@ scheduler.add_job(
    hours=CRON_HOURS,
    id="harvest"
)

logger.info(f"Starting scheduler with {CRON_HOURS} hours interval")
scheduler.start()
logger.info("Scheduler started")
logger.info(f"Next run: {scheduler.get_job('harvest').next_run_time}")

# Pydantic models for API requests and responses
@@ -414,7 +654,7 @@ class HoursResponse(BaseModel):
# Dependency for getting a database cursor
async def get_db():
    """
    Dependency that provides a database cursor.
@@ -445,14 +685,20 @@ async def get_news(
    Returns:
        List of news articles matching the criteria
    """
    try:
        datetime.fromisoformat(from_)
        datetime.fromisoformat(to)
    except ValueError:
        raise HTTPException(400, "Invalid date format")

    db.execute(
        """
        SELECT id, title, description, url, published, country, created_at FROM news
        WHERE country=? AND published BETWEEN ? AND ?
        ORDER BY published DESC
        """,
        (country, from_, to)
    )
    return [dict(row) for row in db.fetchall()]
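# Illustrative request once the router is wired up (path and query-parameter
# names are assumptions inferred from this handler's signature; the route
# decorator sits outside this hunk):
#   GET /news?country=de&from_=2025-01-01&to=2025-01-08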
@@ -622,9 +868,11 @@ async def manual_sync(db: sqlite3.Cursor = Depends(get_db)):
        )

    # Trigger sync in background
    try:
        task = asyncio.create_task(NewsFetcher.harvest_feeds())
        return {"status": "triggered", "task_id": id(task)}
    except Exception as e:
        raise HTTPException(500, f"Failed to trigger sync: {str(e)}")


# Mount static frontend
frontend_path = os.path.join(