refactor: modularize database management, models, and services for better structure and maintainability
2 backend/.gitignore vendored
@@ -56,3 +56,5 @@ logs/
*.swo
/owlynews.sqlite-shm
/owlynews.sqlite-wal
/owlynews.sqlite3-shm
/owlynews.sqlite3-wal
110 backend/app/config.py Normal file
@@ -0,0 +1,110 @@
from pathlib import Path
import os
import logging

DB_PATH = Path(os.getenv("DB_NAME", "owlynews.sqlite3"))
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
MIN_CRON_HOURS = float(os.getenv("MIN_CRON_HOURS", 0.5))
DEFAULT_CRON_HOURS = float(os.getenv("CRON_HOURS", MIN_CRON_HOURS))
CRON_HOURS = max(MIN_CRON_HOURS, DEFAULT_CRON_HOURS)
SYNC_COOLDOWN_MINUTES = int(os.getenv("SYNC_COOLDOWN_MINUTES", 30))
LLM_MODEL = os.getenv("LLM_MODEL", "qwen2:7b-instruct-q4_K_M")
LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", 180))
OLLAMA_API_TIMEOUT_SECONDS = int(os.getenv("OLLAMA_API_TIMEOUT_SECONDS", 10))
ARTICLE_FETCH_TIMEOUT = int(os.getenv("ARTICLE_FETCH_TIMEOUT", 30))
MAX_ARTICLE_LENGTH = int(os.getenv("MAX_ARTICLE_LENGTH", 5000))

frontend_path = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
    "frontend",
    "dist"
)

logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def update_constants_from_db(settings_dict):
    """
    Update global constants with values from the database settings.
    Environment variables take precedence over database settings.

    Args:
        settings_dict: Dictionary of settings from the database
    """
    global OLLAMA_HOST, MIN_CRON_HOURS, CRON_HOURS, SYNC_COOLDOWN_MINUTES
    global LLM_MODEL, LLM_TIMEOUT_SECONDS, OLLAMA_API_TIMEOUT_SECONDS
    global ARTICLE_FETCH_TIMEOUT, MAX_ARTICLE_LENGTH

    if 'ollama_host' in settings_dict and os.getenv("OLLAMA_HOST") is None:
        OLLAMA_HOST = settings_dict['ollama_host']

    if 'min_cron_hours' in settings_dict and os.getenv("MIN_CRON_HOURS") is None:
        try:
            MIN_CRON_HOURS = float(settings_dict['min_cron_hours'])
        except (ValueError, TypeError):
            logger.warning(
                f"⚠️ Invalid min_cron_hours value in DB: "
                f"{settings_dict['min_cron_hours']}"
            )

    if 'cron_hours' in settings_dict and os.getenv("CRON_HOURS") is None:
        try:
            cron_hours_value = float(settings_dict['cron_hours'])
            CRON_HOURS = max(MIN_CRON_HOURS, cron_hours_value)
        except (ValueError, TypeError):
            logger.warning(
                f"⚠️ Invalid cron_hours value in DB: "
                f"{settings_dict['cron_hours']}"
            )

    if 'sync_cooldown_minutes' in settings_dict and os.getenv("SYNC_COOLDOWN_MINUTES") is None:
        try:
            SYNC_COOLDOWN_MINUTES = int(settings_dict['sync_cooldown_minutes'])
        except (ValueError, TypeError):
            logger.warning(
                f"⚠️ Invalid sync_cooldown_minutes value in DB: "
                f"{settings_dict['sync_cooldown_minutes']}"
            )

    if 'llm_model' in settings_dict and os.getenv("LLM_MODEL") is None:
        LLM_MODEL = settings_dict['llm_model']

    if 'llm_timeout_seconds' in settings_dict and os.getenv("LLM_TIMEOUT_SECONDS") is None:
        try:
            LLM_TIMEOUT_SECONDS = int(settings_dict['llm_timeout_seconds'])
        except (ValueError, TypeError):
            logger.warning(
                f"⚠️ Invalid llm_timeout_seconds value in DB: "
                f"{settings_dict['llm_timeout_seconds']}"
            )

    if 'ollama_api_timeout_seconds' in settings_dict and os.getenv("OLLAMA_API_TIMEOUT_SECONDS") is None:
        try:
            OLLAMA_API_TIMEOUT_SECONDS = int(settings_dict['ollama_api_timeout_seconds'])
        except (ValueError, TypeError):
            logger.warning(
                f"⚠️ Invalid ollama_api_timeout_seconds value in DB: "
                f"{settings_dict['ollama_api_timeout_seconds']}"
            )

    if 'article_fetch_timeout' in settings_dict and os.getenv("ARTICLE_FETCH_TIMEOUT") is None:
        try:
            ARTICLE_FETCH_TIMEOUT = int(settings_dict['article_fetch_timeout'])
        except (ValueError, TypeError):
            logger.warning(
                f"⚠️ Invalid article_fetch_timeout value in DB: "
                f"{settings_dict['article_fetch_timeout']}"
            )

    if 'max_article_length' in settings_dict and os.getenv("MAX_ARTICLE_LENGTH") is None:
        try:
            MAX_ARTICLE_LENGTH = int(settings_dict['max_article_length'])
        except (ValueError, TypeError):
            logger.warning(
                f"⚠️ Invalid max_article_length value in DB: "
                f"{settings_dict['max_article_length']}"
            )
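For review context, a minimal usage sketch of the precedence rule in update_constants_from_db; it is not part of the commit and the settings values are made up:

# Illustrative only: hypothetical values, run from the project root.
import os
from backend.app import config

os.environ.pop("CRON_HOURS", None)  # no env override, so the DB value may apply
config.update_constants_from_db({"cron_hours": "6", "ollama_host": "http://ollama:11434"})

print(config.CRON_HOURS)   # 6.0, clamped to at least MIN_CRON_HOURS
print(config.OLLAMA_HOST)  # replaced only if OLLAMA_HOST is absent from the environment

Note that modules importing these names with "from backend.app.config import ..." bind the values at import time; reading them as module attributes, as above, always sees the current values.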
248 backend/app/database.py Normal file
@@ -0,0 +1,248 @@
from contextlib import contextmanager
from pathlib import Path
import sqlite3
from typing import Iterator

from backend.app.config import logger, DB_PATH, update_constants_from_db, OLLAMA_HOST, CRON_HOURS, MIN_CRON_HOURS, \
    SYNC_COOLDOWN_MINUTES, LLM_MODEL, LLM_TIMEOUT_SECONDS, OLLAMA_API_TIMEOUT_SECONDS, ARTICLE_FETCH_TIMEOUT, \
    MAX_ARTICLE_LENGTH


class DatabaseManager:
    """
    Manages database connections and operations for the application.
    Provides methods for initializing the database, executing queries,
    and managing transactions.
    """

    def __init__(self, db_path: Path):
        """
        Initialize the database manager with the given database path.

        Args:
            db_path: Path to the SQLite database file
        """
        self.db_path = db_path
        self._initialize_db()

    def _initialize_db(self) -> None:
        """
        Initialize the database by creating tables if they don't exist.
        Also seeds initial feeds from seed_feeds.json and settings from global constants.
        After initialization, updates global constants with values from the database.
        """
        try:
            schema_file = Path(__file__).parent / "schema.sql"
            if not schema_file.exists():
                logger.error("❌ schema.sql not found")
                raise FileNotFoundError("schema.sql not found")

            with open(schema_file, 'r', encoding='utf-8') as f:
                schema_sql = f.read()

            with self.get_cursor() as cursor:
                statements = [stmt.strip() for stmt in schema_sql.split(';') if stmt.strip()]
                for statement in statements:
                    cursor.execute(statement)

                cursor.execute("SELECT COUNT(*) FROM feeds")
                feed_count = cursor.fetchone()[0]

                if feed_count == 0:
                    self._seed_feeds(cursor)

                cursor.execute("SELECT COUNT(*) FROM settings")
                settings_count = cursor.fetchone()[0]

                if settings_count == 0:
                    self._seed_settings(cursor)

            settings = self.get_all_settings()
            update_constants_from_db(settings)
        except Exception as e:
            logger.error(f"❌ Failed to initialize database: {e}")
            raise

    def get_all_settings(self) -> dict:
        """
        Retrieve all settings from the database.

        Returns:
            Dictionary of settings with key-value pairs
        """
        settings = {}
        try:
            with self.get_cursor(readonly=True) as cursor:
                cursor.execute("SELECT key, val FROM settings")
                for row in cursor.fetchall():
                    settings[row['key']] = row['val']
            return settings
        except Exception as e:
            logger.error(f"❌ Failed to retrieve settings from database: {e}")
            return {}

    def _seed_feeds(self, cursor) -> None:
        """
        Seed initial feeds from seed_feeds.json file.
        """
        import json
        from pathlib import Path

        try:
            seed_file = Path(__file__).parent / "seed_feeds.json"

            if not seed_file.exists():
                logger.warning("⚠️ seed_feeds.json not found, skipping feed seeding")
                return

            with open(seed_file, 'r', encoding='utf-8') as f:
                feeds_data = json.load(f)

            for country, urls in feeds_data.items():
                for url in urls:
                    cursor.execute(
                        "INSERT OR IGNORE INTO feeds (country, url) VALUES (?, ?)",
                        (country, url)
                    )

        except Exception as e:
            logger.error(f"❌ Failed to seed feeds: {e}")

    def _seed_settings(self, cursor) -> None:
        """
        Seed initial settings from global constants.
        """
        try:
            settings_data = {
                'ollama_host': OLLAMA_HOST,
                'min_cron_hours': MIN_CRON_HOURS,
                'cron_hours': CRON_HOURS,
                'sync_cooldown_minutes': SYNC_COOLDOWN_MINUTES,
                'llm_model': LLM_MODEL,
                'llm_timeout_seconds': LLM_TIMEOUT_SECONDS,
                'ollama_api_timeout_seconds': OLLAMA_API_TIMEOUT_SECONDS,
                'article_fetch_timeout': ARTICLE_FETCH_TIMEOUT,
                'max_article_length': MAX_ARTICLE_LENGTH
            }

            for key, val in settings_data.items():
                cursor.execute(
                    "INSERT OR IGNORE INTO settings (key, val) VALUES (?, ?)",
                    (key, str(val))
                )

        except Exception as e:
            logger.error(f"❌ Failed to seed settings: {e}")

    def _get_connection(self) -> sqlite3.Connection:
        """
        Create a thread-safe database connection.

        Returns:
            An active SQLite connection
        """
        conn = sqlite3.connect(
            self.db_path,
            check_same_thread=False,
            timeout=30.0
        )
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA busy_timeout=30000")
        conn.execute("PRAGMA synchronous=NORMAL")
        conn.execute("PRAGMA temp_store=MEMORY")
        return conn

    @contextmanager
    def get_cursor(self, readonly: bool = False) -> Iterator[sqlite3.Cursor]:
        """
        Context manager that provides a database cursor and handles commits and rollbacks.

        Args:
            readonly: If True, opens connection in readonly mode for better concurrency

        Yields:
            A database cursor for executing SQL statements
        """
        conn = None
        try:
            conn = self._get_connection()

            if readonly:
                conn.execute("BEGIN DEFERRED")

            cursor = conn.cursor()
            yield cursor

            if not readonly:
                conn.commit()
        except sqlite3.OperationalError as e:
            if conn:
                conn.rollback()
            if "database is locked" in str(e).lower():
                logger.warning(
                    f"⚠️ Database temporarily locked, operation may need retry: {e}"
                )
            raise e
        except Exception as e:
            if conn:
                conn.rollback()
            raise e
        finally:
            if conn:
                conn.close()

    @contextmanager
    def get_cursor_with_retry(self, readonly: bool = False, max_retries: int = 3) -> Iterator[sqlite3.Cursor]:
        """
        Context manager with retry logic for database operations.

        Args:
            readonly: If True, opens connection in readonly mode
            max_retries: Maximum number of retry attempts

        Yields:
            A database cursor for executing SQL statements
        """
        for attempt in range(max_retries + 1):
            try:
                with self.get_cursor(readonly=readonly) as cursor:
                    yield cursor
                return
            except sqlite3.OperationalError as e:
                if "database is locked" in str(e).lower() and attempt < max_retries:
                    wait_time = (attempt + 1) * 0.1
                    logger.warning(
                        f"⚠️ Database locked, retrying in {wait_time}s "
                        f"(attempt {attempt + 1}/{max_retries + 1})"
                    )
                    import time
                    time.sleep(wait_time)
                    continue
                raise e


db_manager = DatabaseManager(DB_PATH)


async def get_db():
    """
    Dependency that provides a database cursor with retry logic.

    Yields:
        A database cursor for executing SQL statements
    """
    with db_manager.get_cursor_with_retry(readonly=True) as cursor:
        yield cursor


async def get_db_write():
    """
    Dependency that provides a database cursor for write operations with retry logic.

    Yields:
        A database cursor for executing SQL statements
    """
    with db_manager.get_cursor_with_retry(readonly=False) as cursor:
        yield cursor
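A usage sketch for the pieces above, not part of the commit; the feeds table and its columns are taken from the seeding code:

# Illustrative only.
import sqlite3
from fastapi import APIRouter, Depends

from backend.app.database import db_manager, get_db

def add_feed_directly(country: str, url: str) -> None:
    # Write path: the retry-wrapped cursor commits on success and rolls back on error.
    with db_manager.get_cursor_with_retry(readonly=False) as cursor:
        cursor.execute(
            "INSERT OR IGNORE INTO feeds (country, url) VALUES (?, ?)",
            (country, url),
        )

router = APIRouter()

@router.get("/feeds/count")
async def count_feeds(db: sqlite3.Cursor = Depends(get_db)):
    # Read path: a readonly cursor is injected per request by the dependency.
    db.execute("SELECT COUNT(*) AS n FROM feeds")
    return {"count": db.fetchone()["n"]}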
@@ -10,126 +10,25 @@ The application uses SQLite for data storage and APScheduler for scheduling peri
|
||||
|
||||
# Standard library imports
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sqlite3
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from http.client import HTTPException
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterator, List, Optional, Tuple, TypedDict, Union, cast
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
# Third-party imports
|
||||
import feedparser
|
||||
import httpx
|
||||
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
||||
from apscheduler.triggers.interval import IntervalTrigger
|
||||
from bs4 import BeautifulSoup
|
||||
from fastapi import Depends, FastAPI, Response, status
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from pydantic import BaseModel
|
||||
|
||||
DB_PATH = Path(os.getenv("DB_NAME", "owlynews.sqlite3"))
|
||||
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
|
||||
MIN_CRON_HOURS = float(os.getenv("MIN_CRON_HOURS", 0.5))
|
||||
DEFAULT_CRON_HOURS = float(os.getenv("CRON_HOURS", MIN_CRON_HOURS))
|
||||
CRON_HOURS = max(MIN_CRON_HOURS, DEFAULT_CRON_HOURS)
|
||||
SYNC_COOLDOWN_MINUTES = int(os.getenv("SYNC_COOLDOWN_MINUTES", 30))
|
||||
LLM_MODEL = os.getenv("LLM_MODEL", "qwen2:7b-instruct-q4_K_M")
|
||||
LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", 180))
|
||||
OLLAMA_API_TIMEOUT_SECONDS = int(os.getenv("OLLAMA_API_TIMEOUT_SECONDS", 10))
|
||||
ARTICLE_FETCH_TIMEOUT = int(os.getenv("ARTICLE_FETCH_TIMEOUT", 30))
|
||||
MAX_ARTICLE_LENGTH = int(os.getenv("MAX_ARTICLE_LENGTH", 5000))
|
||||
|
||||
def update_constants_from_db(settings_dict):
|
||||
"""
|
||||
Update global constants with values from the database settings.
|
||||
Environment variables take precedence over database settings.
|
||||
|
||||
Args:
|
||||
settings_dict: Dictionary of settings from the database
|
||||
"""
|
||||
global OLLAMA_HOST, MIN_CRON_HOURS, CRON_HOURS, SYNC_COOLDOWN_MINUTES
|
||||
global LLM_MODEL, LLM_TIMEOUT_SECONDS, OLLAMA_API_TIMEOUT_SECONDS
|
||||
global ARTICLE_FETCH_TIMEOUT, MAX_ARTICLE_LENGTH
|
||||
|
||||
if 'ollama_host' in settings_dict and os.getenv("OLLAMA_HOST") is None:
|
||||
OLLAMA_HOST = settings_dict['ollama_host']
|
||||
|
||||
if 'min_cron_hours' in settings_dict and os.getenv("MIN_CRON_HOURS") is None:
|
||||
try:
|
||||
MIN_CRON_HOURS = float(settings_dict['min_cron_hours'])
|
||||
except (ValueError, TypeError):
|
||||
logger.warning(
|
||||
f"⚠️ Invalid min_cron_hours value in DB: "
|
||||
f"{settings_dict['min_cron_hours']}"
|
||||
)
|
||||
|
||||
if 'cron_hours' in settings_dict and os.getenv("CRON_HOURS") is None:
|
||||
try:
|
||||
cron_hours_value = float(settings_dict['cron_hours'])
|
||||
CRON_HOURS = max(MIN_CRON_HOURS, cron_hours_value)
|
||||
except (ValueError, TypeError):
|
||||
logger.warning(
|
||||
f"⚠️ Invalid cron_hours value in DB: "
|
||||
f"{settings_dict['cron_hours']}"
|
||||
)
|
||||
|
||||
if 'sync_cooldown_minutes' in settings_dict and os.getenv("SYNC_COOLDOWN_MINUTES") is None:
|
||||
try:
|
||||
SYNC_COOLDOWN_MINUTES = int(settings_dict['sync_cooldown_minutes'])
|
||||
except (ValueError, TypeError):
|
||||
logger.warning(
|
||||
f"⚠️ Invalid sync_cooldown_minutes value in DB: "
|
||||
f"{settings_dict['sync_cooldown_minutes']}"
|
||||
)
|
||||
|
||||
if 'llm_model' in settings_dict and os.getenv("LLM_MODEL") is None:
|
||||
LLM_MODEL = settings_dict['llm_model']
|
||||
|
||||
if 'llm_timeout_seconds' in settings_dict and os.getenv("LLM_TIMEOUT_SECONDS") is None:
|
||||
try:
|
||||
LLM_TIMEOUT_SECONDS = int(settings_dict['llm_timeout_seconds'])
|
||||
except (ValueError, TypeError):
|
||||
logger.warning(
|
||||
f"⚠️ Invalid llm_timeout_seconds value in DB: "
|
||||
f"{settings_dict['llm_timeout_seconds']}"
|
||||
)
|
||||
|
||||
if 'ollama_api_timeout_seconds' in settings_dict and os.getenv("OLLAMA_API_TIMEOUT_SECONDS") is None:
|
||||
try:
|
||||
OLLAMA_API_TIMEOUT_SECONDS = int(settings_dict['ollama_api_timeout_seconds'])
|
||||
except (ValueError, TypeError):
|
||||
logger.warning(
|
||||
f"⚠️ Invalid ollama_api_timeout_seconds value in DB: "
|
||||
f"{settings_dict['ollama_api_timeout_seconds']}"
|
||||
)
|
||||
|
||||
if 'article_fetch_timeout' in settings_dict and os.getenv("ARTICLE_FETCH_TIMEOUT") is None:
|
||||
try:
|
||||
ARTICLE_FETCH_TIMEOUT = int(settings_dict['article_fetch_timeout'])
|
||||
except (ValueError, TypeError):
|
||||
logger.warning(
|
||||
f"⚠️ Invalid article_fetch_timeout value in DB: "
|
||||
f"{settings_dict['article_fetch_timeout']}"
|
||||
)
|
||||
|
||||
if 'max_article_length' in settings_dict and os.getenv("MAX_ARTICLE_LENGTH") is None:
|
||||
try:
|
||||
MAX_ARTICLE_LENGTH = int(settings_dict['max_article_length'])
|
||||
except (ValueError, TypeError):
|
||||
logger.warning(
|
||||
f"⚠️ Invalid max_article_length value in DB: "
|
||||
f"{settings_dict['max_article_length']}"
|
||||
)
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.WARNING,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
from backend.app.config import logger, OLLAMA_HOST, CRON_HOURS, MIN_CRON_HOURS, \
|
||||
SYNC_COOLDOWN_MINUTES, LLM_MODEL, OLLAMA_API_TIMEOUT_SECONDS, frontend_path
|
||||
from backend.app.database import get_db, get_db_write
|
||||
from backend.app.models import TimestampResponse, SuccessResponse, FeedData, ModelStatus, ErrorResponse, HoursResponse, \
|
||||
CronSettings
|
||||
from backend.app.services import NewsFetcher
|
||||
|
||||
app = FastAPI(
|
||||
title="Owly News Summariser",
|
||||
@@ -137,597 +36,6 @@ app = FastAPI(
|
||||
version="1.0.0"
|
||||
)
|
||||
|
||||
|
||||
class DatabaseManager:
|
||||
"""
|
||||
Manages database connections and operations for the application.
|
||||
Provides methods for initializing the database, executing queries,
|
||||
and managing transactions.
|
||||
"""
|
||||
|
||||
def __init__(self, db_path: Path):
|
||||
"""
|
||||
Initialize the database manager with the given database path.
|
||||
|
||||
Args:
|
||||
db_path: Path to the SQLite database file
|
||||
"""
|
||||
self.db_path = db_path
|
||||
self._initialize_db()
|
||||
|
||||
def _initialize_db(self) -> None:
|
||||
"""
|
||||
Initialize the database by creating tables if they don't exist.
|
||||
Also seeds initial feeds from seed_feeds.json and settings from global constants.
|
||||
After initialization, updates global constants with values from the database.
|
||||
"""
|
||||
try:
|
||||
schema_file = Path(__file__).parent / "schema.sql"
|
||||
if not schema_file.exists():
|
||||
logger.error("❌ schema.sql not found")
|
||||
raise FileNotFoundError("schema.sql not found")
|
||||
|
||||
with open(schema_file, 'r', encoding='utf-8') as f:
|
||||
schema_sql = f.read()
|
||||
|
||||
with self.get_cursor() as cursor:
|
||||
statements = [stmt.strip() for stmt in schema_sql.split(';') if stmt.strip()]
|
||||
for statement in statements:
|
||||
cursor.execute(statement)
|
||||
|
||||
cursor.execute("SELECT COUNT(*) FROM feeds")
|
||||
feed_count = cursor.fetchone()[0]
|
||||
|
||||
if feed_count == 0:
|
||||
self._seed_feeds(cursor)
|
||||
|
||||
cursor.execute("SELECT COUNT(*) FROM settings")
|
||||
settings_count = cursor.fetchone()[0]
|
||||
|
||||
if settings_count == 0:
|
||||
self._seed_settings(cursor)
|
||||
|
||||
settings = self.get_all_settings()
|
||||
update_constants_from_db(settings)
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Failed to initialize database: {e}")
|
||||
raise
|
||||
|
||||
def get_all_settings(self) -> dict:
|
||||
"""
|
||||
Retrieve all settings from the database.
|
||||
|
||||
Returns:
|
||||
Dictionary of settings with key-value pairs
|
||||
"""
|
||||
settings = {}
|
||||
try:
|
||||
with self.get_cursor(readonly=True) as cursor:
|
||||
cursor.execute("SELECT key, val FROM settings")
|
||||
for row in cursor.fetchall():
|
||||
settings[row['key']] = row['val']
|
||||
return settings
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Failed to retrieve settings from database: {e}")
|
||||
return {}
|
||||
|
||||
def _seed_feeds(self, cursor) -> None:
|
||||
"""
|
||||
Seed initial feeds from seed_feeds.json file.
|
||||
"""
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
seed_file = Path(__file__).parent / "seed_feeds.json"
|
||||
|
||||
if not seed_file.exists():
|
||||
logger.warning("⚠️ seed_feeds.json not found, skipping feed seeding")
|
||||
return
|
||||
|
||||
with open(seed_file, 'r', encoding='utf-8') as f:
|
||||
feeds_data = json.load(f)
|
||||
|
||||
for country, urls in feeds_data.items():
|
||||
for url in urls:
|
||||
cursor.execute(
|
||||
"INSERT OR IGNORE INTO feeds (country, url) VALUES (?, ?)",
|
||||
(country, url)
|
||||
)
|
||||
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Failed to seed feeds: {e}")
|
||||
|
||||
def _seed_settings(self, cursor) -> None:
|
||||
"""
|
||||
Seed initial settings from global constants.
|
||||
"""
|
||||
try:
|
||||
settings_data = {
|
||||
'ollama_host': OLLAMA_HOST,
|
||||
'min_cron_hours': MIN_CRON_HOURS,
|
||||
'cron_hours': CRON_HOURS,
|
||||
'sync_cooldown_minutes': SYNC_COOLDOWN_MINUTES,
|
||||
'llm_model': LLM_MODEL,
|
||||
'llm_timeout_seconds': LLM_TIMEOUT_SECONDS,
|
||||
'ollama_api_timeout_seconds': OLLAMA_API_TIMEOUT_SECONDS,
|
||||
'article_fetch_timeout': ARTICLE_FETCH_TIMEOUT,
|
||||
'max_article_length': MAX_ARTICLE_LENGTH
|
||||
}
|
||||
|
||||
for key, val in settings_data.items():
|
||||
cursor.execute(
|
||||
"INSERT OR IGNORE INTO settings (key, val) VALUES (?, ?)",
|
||||
(key, str(val))
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Failed to seed settings: {e}")
|
||||
|
||||
def _get_connection(self) -> sqlite3.Connection:
|
||||
"""
|
||||
Create a thread-safe database connection.
|
||||
|
||||
Returns:
|
||||
An active SQLite connection
|
||||
"""
|
||||
conn = sqlite3.connect(
|
||||
self.db_path,
|
||||
check_same_thread=False,
|
||||
timeout=30.0
|
||||
)
|
||||
conn.row_factory = sqlite3.Row
|
||||
conn.execute("PRAGMA journal_mode=WAL")
|
||||
conn.execute("PRAGMA busy_timeout=30000")
|
||||
conn.execute("PRAGMA synchronous=NORMAL")
|
||||
conn.execute("PRAGMA temp_store=MEMORY")
|
||||
return conn
|
||||
|
||||
@contextmanager
|
||||
def get_cursor(self, readonly: bool = False) -> Iterator[sqlite3.Cursor]:
|
||||
"""
|
||||
Context manager that provides a database cursor and handles commits and rollbacks.
|
||||
|
||||
Args:
|
||||
readonly: If True, opens connection in readonly mode for better concurrency
|
||||
|
||||
Yields:
|
||||
A database cursor for executing SQL statements
|
||||
"""
|
||||
conn = None
|
||||
try:
|
||||
conn = self._get_connection()
|
||||
|
||||
if readonly:
|
||||
conn.execute("BEGIN DEFERRED")
|
||||
|
||||
cursor = conn.cursor()
|
||||
yield cursor
|
||||
|
||||
if not readonly:
|
||||
conn.commit()
|
||||
except sqlite3.OperationalError as e:
|
||||
if conn:
|
||||
conn.rollback()
|
||||
if "database is locked" in str(e).lower():
|
||||
logger.warning(
|
||||
f"⚠️ Database temporarily locked, operation may need retry: {e}"
|
||||
)
|
||||
raise e
|
||||
except Exception as e:
|
||||
if conn:
|
||||
conn.rollback()
|
||||
raise e
|
||||
finally:
|
||||
if conn:
|
||||
conn.close()
|
||||
|
||||
@contextmanager
|
||||
def get_cursor_with_retry(self, readonly: bool = False, max_retries: int = 3) -> Iterator[sqlite3.Cursor]:
|
||||
"""
|
||||
Context manager with retry logic for database operations.
|
||||
|
||||
Args:
|
||||
readonly: If True, opens connection in readonly mode
|
||||
max_retries: Maximum number of retry attempts
|
||||
|
||||
Yields:
|
||||
A database cursor for executing SQL statements
|
||||
"""
|
||||
for attempt in range(max_retries + 1):
|
||||
try:
|
||||
with self.get_cursor(readonly=readonly) as cursor:
|
||||
yield cursor
|
||||
return
|
||||
except sqlite3.OperationalError as e:
|
||||
if "database is locked" in str(e).lower() and attempt < max_retries:
|
||||
wait_time = (attempt + 1) * 0.1
|
||||
logger.warning(
|
||||
f"⚠️ Database locked, retrying in {wait_time}s "
|
||||
f"(attempt {attempt + 1}/{max_retries + 1})"
|
||||
)
|
||||
import time
|
||||
time.sleep(wait_time)
|
||||
continue
|
||||
raise e
|
||||
|
||||
|
||||
db_manager = DatabaseManager(DB_PATH)
|
||||
|
||||
|
||||
class ArticleSummary(TypedDict):
|
||||
"""Type definition for article summary data returned from the LLM."""
|
||||
title: str
|
||||
summary_de: str
|
||||
summary_en: str
|
||||
|
||||
|
||||
class NewsFetcher:
|
||||
"""
|
||||
Handles fetching and summarizing news articles from RSS feeds.
|
||||
Uses Ollama/qwen to generate summaries of articles.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
async def fetch_article_content(client: httpx.AsyncClient, url: str) -> str:
|
||||
"""
|
||||
Fetch and extract the main content from an article URL.
|
||||
|
||||
Args:
|
||||
client: An active httpx AsyncClient for making requests
|
||||
url: URL of the article to fetch
|
||||
|
||||
Returns:
|
||||
Extracted text content from the article, or empty string if failed
|
||||
"""
|
||||
try:
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
|
||||
'AppleWebKit/537.36 (KHTML, like Gecko) '
|
||||
'Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
|
||||
response = await client.get(
|
||||
url,
|
||||
headers=headers,
|
||||
timeout=ARTICLE_FETCH_TIMEOUT,
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
response.raise_for_status()
|
||||
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
|
||||
for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']):
|
||||
element.decompose()
|
||||
|
||||
content_selectors = [
|
||||
'article',
|
||||
'[role="main"]',
|
||||
'.content',
|
||||
'.article-content',
|
||||
'.post-content',
|
||||
'.entry-content',
|
||||
'.main-content',
|
||||
'main',
|
||||
'.story-body',
|
||||
'.article-body'
|
||||
]
|
||||
|
||||
article_text = ""
|
||||
|
||||
for selector in content_selectors:
|
||||
elements = soup.select(selector)
|
||||
if elements:
|
||||
for element in elements:
|
||||
text = element.get_text(separator=' ', strip=True)
|
||||
if len(text) > len(article_text):
|
||||
article_text = text
|
||||
break
|
||||
|
||||
# Fallback: get text from body if no specific content area found
|
||||
if not article_text:
|
||||
body = soup.find('body')
|
||||
if body:
|
||||
article_text = body.get_text(separator=' ', strip=True)
|
||||
|
||||
article_text = re.sub(r'\s+', ' ', article_text) # Normalize whitespace
|
||||
article_text = article_text.strip()
|
||||
|
||||
# Limit length to avoid overwhelming the LLM
|
||||
if len(article_text) > MAX_ARTICLE_LENGTH:
|
||||
article_text = article_text[:MAX_ARTICLE_LENGTH] + "..."
|
||||
|
||||
return article_text
|
||||
|
||||
except httpx.TimeoutException:
|
||||
logger.warning(f"⏰ Timeout fetching article content from: {url}")
|
||||
return ""
|
||||
except httpx.HTTPError as e:
|
||||
logger.warning(f"🌐 HTTP error fetching article content from {url}: {e}")
|
||||
return ""
|
||||
except Exception as e:
|
||||
logger.warning(f"❌ Error fetching article content from {url}: {type(e).__name__}: {e}")
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def build_prompt(url: str, title: str = "", description: str = "", content: str = "") -> str:
|
||||
"""
|
||||
Generate a prompt for the LLM to summarize an article.
|
||||
|
||||
Args:
|
||||
url: Public URL of the article to summarize
|
||||
title: Article title from RSS feed (optional)
|
||||
description: Article description from RSS feed (optional)
|
||||
content: Extracted article content (optional)
|
||||
|
||||
Returns:
|
||||
A formatted prompt string that instructs the LLM to generate
|
||||
a JSON response with title and summaries in German and English
|
||||
"""
|
||||
context_info = []
|
||||
if title:
|
||||
context_info.append(f"RSS-Titel: {title}")
|
||||
if description:
|
||||
context_info.append(f"RSS-Beschreibung: {description}")
|
||||
if content:
|
||||
content_preview = content[:500] + "..." if len(content) > 500 else content
|
||||
context_info.append(f"Artikel-Inhalt: {content_preview}")
|
||||
|
||||
context = "\n".join(context_info) if context_info else "Keine zusätzlichen Informationen verfügbar."
|
||||
|
||||
return (
|
||||
"### Aufgabe\n"
|
||||
f"Du sollst eine Nachricht basierend auf der URL und den verfügbaren Informationen zusammenfassen.\n"
|
||||
f"URL: {url}\n"
|
||||
f"Verfügbare Informationen:\n{context}\n\n"
|
||||
"### Regeln\n"
|
||||
"1. Nutze VORRANGIG den Artikel-Inhalt falls verfügbar, ergänze mit RSS-Informationen\n"
|
||||
"2. Falls kein Artikel-Inhalt verfügbar ist, nutze RSS-Titel und -Beschreibung\n"
|
||||
"3. Falls keine ausreichenden Informationen vorliegen, erstelle eine plausible Zusammenfassung basierend auf der URL\n"
|
||||
"4. Gib ausschließlich **gültiges minifiziertes JSON** zurück – kein Markdown, keine Kommentare\n"
|
||||
"5. Struktur: {\"title\":\"…\",\"description\":\"…\"}\n"
|
||||
"6. title: Aussagekräftiger deutscher Titel (max 100 Zeichen)\n"
|
||||
"7. description: Deutsche Zusammenfassung (zwischen 100 und 160 Wörter)\n"
|
||||
"8. Kein Text vor oder nach dem JSON\n\n"
|
||||
"### Ausgabe\n"
|
||||
"Jetzt antworte mit dem JSON:"
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
async def summarize_article(
|
||||
client: httpx.AsyncClient,
|
||||
url: str,
|
||||
title: str = "",
|
||||
description: str = ""
|
||||
) -> Optional[ArticleSummary]:
|
||||
"""
|
||||
Generate a summary of an article using the LLM.
|
||||
Now fetches the actual article content for more accurate summaries.
|
||||
|
||||
Args:
|
||||
client: An active httpx AsyncClient for making requests
|
||||
url: URL of the article to summarize
|
||||
title: Article title from RSS feed
|
||||
description: Article description from RSS feed
|
||||
|
||||
Returns:
|
||||
A dictionary containing the article title and summaries in German and English,
|
||||
or None if summarization failed
|
||||
"""
|
||||
article_content = await NewsFetcher.fetch_article_content(client, url)
|
||||
|
||||
if not article_content:
|
||||
logger.warning(f"⚠️ Could not fetch article content, using RSS data only")
|
||||
|
||||
prompt = NewsFetcher.build_prompt(url, title, description, article_content)
|
||||
payload = {
|
||||
"model": LLM_MODEL,
|
||||
"prompt": prompt,
|
||||
"stream": False,
|
||||
"temperature": 0.1,
|
||||
"format": "json"
|
||||
}
|
||||
|
||||
try:
|
||||
response = await client.post(
|
||||
f"{OLLAMA_HOST}/api/generate",
|
||||
json=payload,
|
||||
timeout=LLM_TIMEOUT_SECONDS
|
||||
)
|
||||
|
||||
response.raise_for_status()
|
||||
result = response.json()
|
||||
llm_response = result["response"]
|
||||
|
||||
if isinstance(llm_response, str):
|
||||
summary_data = json.loads(llm_response)
|
||||
else:
|
||||
summary_data = llm_response
|
||||
|
||||
# Validate required fields
|
||||
required_fields = ["title", "description"]
|
||||
missing_fields = [field for field in required_fields if field not in summary_data]
|
||||
|
||||
if missing_fields:
|
||||
logger.warning(
|
||||
f"⚠️ Missing required fields in summary: {missing_fields}"
|
||||
)
|
||||
return None
|
||||
|
||||
# Check summary quality metrics
|
||||
description = len(summary_data.get("description", "").split())
|
||||
|
||||
if description > 160 or description < 100:
|
||||
logger.warning(
|
||||
f"⚠️ Summary exceeds word limit - "
|
||||
f"Description: {description}/160"
|
||||
)
|
||||
|
||||
return cast(ArticleSummary, summary_data)
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"❌ JSON parsing error for {url}: {e}")
|
||||
logger.error(
|
||||
f"🔍 Raw response that failed to parse: {llm_response[:500]}..."
|
||||
)
|
||||
return None
|
||||
except httpx.HTTPError as e:
|
||||
logger.error(f"❌ HTTP error for {url}: {e}")
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Unexpected error summarizing {url}: {type(e).__name__}: {e}")
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
async def harvest_feeds() -> None:
|
||||
"""
|
||||
Fetch articles from all feeds and store summaries in the database.
|
||||
This is the main function that runs periodically to update the news database.
|
||||
"""
|
||||
|
||||
total_articles = 0
|
||||
successful_articles = 0
|
||||
failed_articles = 0
|
||||
|
||||
try:
|
||||
with db_manager.get_cursor() as cursor:
|
||||
cursor.execute("SELECT country, url FROM feeds")
|
||||
feeds = cursor.fetchall()
|
||||
|
||||
async with httpx.AsyncClient() as client:
|
||||
for i, feed_row in enumerate(feeds, 1):
|
||||
feed_stats = await NewsFetcher._process_feed(client, feed_row)
|
||||
|
||||
total_articles += feed_stats['total']
|
||||
successful_articles += feed_stats['successful']
|
||||
failed_articles += feed_stats['failed']
|
||||
|
||||
current_time = int(datetime.now(timezone.utc).timestamp())
|
||||
with db_manager.get_cursor() as cursor:
|
||||
cursor.execute(
|
||||
"UPDATE meta SET val=? WHERE key='last_sync'",
|
||||
(str(current_time),)
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Critical error during harvest: {type(e).__name__}: {e}")
|
||||
raise
|
||||
|
||||
@staticmethod
|
||||
async def _process_feed(
|
||||
client: httpx.AsyncClient,
|
||||
feed_row: sqlite3.Row
|
||||
) -> Dict[str, int]:
|
||||
"""
|
||||
Process a single feed, fetching and summarizing all articles.
|
||||
Now saves summaries immediately to the database with better concurrency.
|
||||
|
||||
Args:
|
||||
client: An active httpx AsyncClient for making requests
|
||||
feed_row: A database row containing feed information
|
||||
|
||||
Returns:
|
||||
Dictionary with processing statistics
|
||||
"""
|
||||
stats = {'total': 0, 'successful': 0, 'failed': 0, 'skipped': 0}
|
||||
|
||||
try:
|
||||
feed_data = feedparser.parse(feed_row["url"])
|
||||
|
||||
if hasattr(feed_data, 'bozo') and feed_data.bozo:
|
||||
logger.warning(f"⚠️ Feed has parsing issues: {feed_row['url']}")
|
||||
if hasattr(feed_data, 'bozo_exception'):
|
||||
logger.warning(f"⚠️ Feed exception: {feed_data.bozo_exception}")
|
||||
|
||||
total_entries = len(feed_data.entries)
|
||||
|
||||
if total_entries == 0:
|
||||
logger.warning(f"⚠️ No entries found in feed: {feed_row['url']}")
|
||||
return stats
|
||||
|
||||
for i, entry in enumerate(feed_data.entries, 1):
|
||||
stats['total'] += 1
|
||||
|
||||
if not hasattr(entry, "link"):
|
||||
stats['skipped'] += 1
|
||||
continue
|
||||
|
||||
if not hasattr(entry, "published_parsed"):
|
||||
stats['skipped'] += 1
|
||||
continue
|
||||
|
||||
article_url = entry.link
|
||||
|
||||
try:
|
||||
published = datetime(
|
||||
*entry.published_parsed[:6],
|
||||
tzinfo=timezone.utc
|
||||
)
|
||||
except (TypeError, ValueError):
|
||||
stats['skipped'] += 1
|
||||
continue
|
||||
|
||||
# Check if article already exists - use readonly connection for better concurrency
|
||||
try:
|
||||
with db_manager.get_cursor_with_retry(readonly=True) as cursor:
|
||||
cursor.execute("SELECT id FROM news WHERE url = ?", (article_url,))
|
||||
if cursor.fetchone():
|
||||
stats['skipped'] += 1
|
||||
continue
|
||||
except Exception as db_error:
|
||||
logger.warning(f"⚠️ Database check failed for article {i}, continuing: {db_error}")
|
||||
|
||||
rss_title = getattr(entry, 'title', '')
|
||||
rss_description = getattr(entry, 'description', '') or getattr(entry, 'summary', '')
|
||||
|
||||
summary = await NewsFetcher.summarize_article(
|
||||
client,
|
||||
article_url,
|
||||
title=rss_title,
|
||||
description=rss_description
|
||||
)
|
||||
|
||||
if not summary:
|
||||
logger.warning(f"❌ Failed to get summary for article {i}: {article_url}")
|
||||
stats['failed'] += 1
|
||||
continue
|
||||
|
||||
published_timestamp = int(published.timestamp())
|
||||
|
||||
try:
|
||||
with db_manager.get_cursor_with_retry(readonly=False) as cursor:
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT OR IGNORE INTO news
|
||||
(title, description, url, published, country)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
""",
|
||||
(
|
||||
summary["title"],
|
||||
summary["description"],
|
||||
article_url,
|
||||
published_timestamp,
|
||||
feed_row["country"],
|
||||
)
|
||||
)
|
||||
|
||||
stats['successful'] += 1
|
||||
|
||||
except Exception as db_error:
|
||||
logger.error(f"❌ Database error for article {i}: {db_error}")
|
||||
stats['failed'] += 1
|
||||
continue
|
||||
|
||||
await asyncio.sleep(0.01) # 10ms delay to yield control
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Error processing feed {feed_row['url']}: {type(e).__name__}: {e}")
|
||||
|
||||
return stats
|
||||
|
||||
|
||||
scheduler = AsyncIOScheduler(timezone="UTC")
|
||||
scheduler.add_job(
|
||||
NewsFetcher.harvest_feeds,
|
||||
@@ -738,74 +46,13 @@ scheduler.add_job(
|
||||
scheduler.start()
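The add_job call above is split across the hunk boundary, so its arguments are not fully visible. A compact sketch of the equivalent setup, with the elided parts marked as assumptions:

# Illustrative only: the trigger and job id are assumptions, not read from the diff.
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.interval import IntervalTrigger

from backend.app.config import CRON_HOURS
from backend.app.services import NewsFetcher

scheduler = AsyncIOScheduler(timezone="UTC")
scheduler.add_job(
    NewsFetcher.harvest_feeds,
    IntervalTrigger(hours=CRON_HOURS),  # assumed interval trigger, matching the IntervalTrigger import
    id="harvest_feeds",                 # hypothetical id
)
scheduler.start()  # requires a running asyncio event loop, as in the FastAPI app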
|
||||
|
||||
|
||||
# Pydantic models for API requests and responses
|
||||
class CronSettings(BaseModel):
|
||||
"""Settings for the cron job that harvests news."""
|
||||
hours: float
|
||||
|
||||
|
||||
class FeedData(BaseModel):
|
||||
"""Data for a news feed."""
|
||||
country: str
|
||||
url: str
|
||||
|
||||
|
||||
class ModelStatus(BaseModel):
|
||||
"""Status of the LLM model."""
|
||||
name: str
|
||||
status: str
|
||||
available_models: List[str]
|
||||
|
||||
|
||||
class ErrorResponse(BaseModel):
|
||||
"""Standard error response."""
|
||||
status: str
|
||||
message: str
|
||||
|
||||
|
||||
class SuccessResponse(BaseModel):
|
||||
"""Standard success response."""
|
||||
status: str
|
||||
|
||||
|
||||
class TimestampResponse(BaseModel):
|
||||
"""Response containing a timestamp."""
|
||||
ts: int
|
||||
|
||||
|
||||
class HoursResponse(BaseModel):
|
||||
"""Response containing hours setting."""
|
||||
hours: float
|
||||
|
||||
|
||||
async def get_db():
|
||||
"""
|
||||
Dependency that provides a database cursor with retry logic.
|
||||
|
||||
Yields:
|
||||
A database cursor for executing SQL statements
|
||||
"""
|
||||
with db_manager.get_cursor_with_retry(readonly=True) as cursor:
|
||||
yield cursor
|
||||
|
||||
async def get_db_write():
|
||||
"""
|
||||
Dependency that provides a database cursor for write operations with retry logic.
|
||||
|
||||
Yields:
|
||||
A database cursor for executing SQL statements
|
||||
"""
|
||||
with db_manager.get_cursor_with_retry(readonly=False) as cursor:
|
||||
yield cursor
|
||||
|
||||
|
||||
# API endpoints
|
||||
@app.get("/news", response_model=List[Dict[str, Any]])
|
||||
async def get_news(
|
||||
country: str = "DE",
|
||||
from_: str = "2025-07-01",
|
||||
to_: str = datetime.now(timezone.utc).strftime("%Y-%m-%d"),
|
||||
db: sqlite3.Cursor = Depends(get_db)
|
||||
country: str = "DE",
|
||||
from_: str = "2025-07-01",
|
||||
to_: str = datetime.now(timezone.utc).strftime("%Y-%m-%d"),
|
||||
db: sqlite3.Cursor = Depends(get_db)
|
||||
):
|
||||
"""
|
||||
Get news articles filtered by country and date range.
|
||||
@@ -829,11 +76,11 @@ async def get_news(
|
||||
|
||||
db.execute(
|
||||
"""
|
||||
SELECT id, title, description, url, published, country, created_at
|
||||
SELECT id, title, description, url, published, country, created_at
|
||||
FROM news
|
||||
WHERE country = ? AND published BETWEEN ? AND ?
|
||||
ORDER BY published DESC
|
||||
LIMIT 1000
|
||||
WHERE country = ?
|
||||
AND published BETWEEN ? AND ?
|
||||
ORDER BY published DESC LIMIT 1000
|
||||
""",
|
||||
(country, from_ts, to_ts)
|
||||
)
|
||||
@@ -891,8 +138,8 @@ async def get_last_sync(db: sqlite3.Cursor = Depends(get_db)):
|
||||
|
||||
@app.post("/feeds", response_model=SuccessResponse)
|
||||
async def add_feed(
|
||||
feed: FeedData,
|
||||
db: sqlite3.Cursor = Depends(get_db_write)
|
||||
feed: FeedData,
|
||||
db: sqlite3.Cursor = Depends(get_db_write)
|
||||
):
|
||||
"""
|
||||
Add a new news feed.
|
||||
@@ -920,8 +167,8 @@ async def add_feed(
|
||||
|
||||
@app.delete("/feeds", response_model=SuccessResponse)
|
||||
async def delete_feed(
|
||||
url: str,
|
||||
db: sqlite3.Cursor = Depends(get_db_write)
|
||||
url: str,
|
||||
db: sqlite3.Cursor = Depends(get_db_write)
|
||||
):
|
||||
"""
|
||||
Delete a news feed by URL.
|
||||
@@ -1062,9 +309,4 @@ async def update_cron_schedule(data: CronSettings, db: sqlite3.Cursor = Depends(
|
||||
|
||||
|
||||
# Mount static frontend
|
||||
frontend_path = os.path.join(
|
||||
os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
|
||||
"frontend",
|
||||
"dist"
|
||||
)
|
||||
app.mount("/", StaticFiles(directory=frontend_path, html=True), name="static")
|
||||
|
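For orientation, a client-side sketch of calling the /news endpoint defined above; it is not part of the commit, and the host and port are assumptions:

# Illustrative only: assumes the API is served locally on port 8000.
import httpx

resp = httpx.get(
    "http://localhost:8000/news",
    params={"country": "DE", "from_": "2025-07-01", "to_": "2025-07-31"},
    timeout=10,
)
resp.raise_for_status()
for item in resp.json()[:3]:
    print(item["published"], item["title"])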
50 backend/app/models.py Normal file
@@ -0,0 +1,50 @@
from typing import TypedDict, List

from pydantic import BaseModel


class ArticleSummary(TypedDict):
    """Type definition for article summary data returned from the LLM."""
    title: str
    summary_de: str
    summary_en: str


# Pydantic models for API requests and responses
class CronSettings(BaseModel):
    """Settings for the cron job that harvests news."""
    hours: float


class FeedData(BaseModel):
    """Data for a news feed."""
    country: str
    url: str


class ModelStatus(BaseModel):
    """Status of the LLM model."""
    name: str
    status: str
    available_models: List[str]


class ErrorResponse(BaseModel):
    """Standard error response."""
    status: str
    message: str


class SuccessResponse(BaseModel):
    """Standard success response."""
    status: str


class TimestampResponse(BaseModel):
    """Response containing a timestamp."""
    ts: int


class HoursResponse(BaseModel):
    """Response containing hours setting."""
    hours: float
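A small sketch, not part of the commit, of how these models validate payloads; the sample data is made up and the pydantic major version is not pinned by the diff:

# Illustrative only.
from pydantic import ValidationError

from backend.app.models import FeedData, HoursResponse

feed = FeedData(country="DE", url="https://example.com/rss.xml")
print(feed.model_dump())  # pydantic v2; use .dict() on pydantic v1

try:
    FeedData(country="DE")  # missing url
except ValidationError as exc:
    print(exc.errors()[0]["loc"])  # ('url',)

print(HoursResponse(hours=2).hours)  # coerced to 2.0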
381 backend/app/services.py Normal file
@@ -0,0 +1,381 @@
import asyncio
import json
import re
import sqlite3
from datetime import datetime, timezone
from typing import Optional, cast, Dict

import feedparser
import httpx
from bs4 import BeautifulSoup

from backend.app.config import ARTICLE_FETCH_TIMEOUT, MAX_ARTICLE_LENGTH, logger, LLM_MODEL, OLLAMA_HOST, \
    LLM_TIMEOUT_SECONDS
from backend.app.database import db_manager
from backend.app.models import ArticleSummary


class NewsFetcher:
    """
    Handles fetching and summarizing news articles from RSS feeds.
    Uses Ollama/qwen to generate summaries of articles.
    """

    @staticmethod
    async def fetch_article_content(client: httpx.AsyncClient, url: str) -> str:
        """
        Fetch and extract the main content from an article URL.

        Args:
            client: An active httpx AsyncClient for making requests
            url: URL of the article to fetch

        Returns:
            Extracted text content from the article, or empty string if failed
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/91.0.4472.124 Safari/537.36'
            }

            response = await client.get(
                url,
                headers=headers,
                timeout=ARTICLE_FETCH_TIMEOUT,
                follow_redirects=True
            )

            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']):
                element.decompose()

            content_selectors = [
                'article',
                '[role="main"]',
                '.content',
                '.article-content',
                '.post-content',
                '.entry-content',
                '.main-content',
                'main',
                '.story-body',
                '.article-body'
            ]

            article_text = ""

            for selector in content_selectors:
                elements = soup.select(selector)
                if elements:
                    for element in elements:
                        text = element.get_text(separator=' ', strip=True)
                        if len(text) > len(article_text):
                            article_text = text
                    break

            # Fallback: get text from body if no specific content area found
            if not article_text:
                body = soup.find('body')
                if body:
                    article_text = body.get_text(separator=' ', strip=True)

            article_text = re.sub(r'\s+', ' ', article_text)  # Normalize whitespace
            article_text = article_text.strip()

            # Limit length to avoid overwhelming the LLM
            if len(article_text) > MAX_ARTICLE_LENGTH:
                article_text = article_text[:MAX_ARTICLE_LENGTH] + "..."

            return article_text

        except httpx.TimeoutException:
            logger.warning(f"⏰ Timeout fetching article content from: {url}")
            return ""
        except httpx.HTTPError as e:
            logger.warning(f"🌐 HTTP error fetching article content from {url}: {e}")
            return ""
        except Exception as e:
            logger.warning(f"❌ Error fetching article content from {url}: {type(e).__name__}: {e}")
            return ""

    @staticmethod
    def build_prompt(url: str, title: str = "", description: str = "", content: str = "") -> str:
        """
        Generate a prompt for the LLM to summarize an article.

        Args:
            url: Public URL of the article to summarize
            title: Article title from RSS feed (optional)
            description: Article description from RSS feed (optional)
            content: Extracted article content (optional)

        Returns:
            A formatted prompt string that instructs the LLM to generate
            a JSON response with title and summaries in German and English
        """
        context_info = []
        if title:
            context_info.append(f"RSS-Titel: {title}")
        if description:
            context_info.append(f"RSS-Beschreibung: {description}")
        if content:
            content_preview = content[:500] + "..." if len(content) > 500 else content
            context_info.append(f"Artikel-Inhalt: {content_preview}")

        context = "\n".join(context_info) if context_info else "Keine zusätzlichen Informationen verfügbar."

        return (
            "### Aufgabe\n"
            f"Du sollst eine Nachricht basierend auf der URL und den verfügbaren Informationen zusammenfassen.\n"
            f"URL: {url}\n"
            f"Verfügbare Informationen:\n{context}\n\n"
            "### Regeln\n"
            "1. Nutze VORRANGIG den Artikel-Inhalt falls verfügbar, ergänze mit RSS-Informationen\n"
            "2. Falls kein Artikel-Inhalt verfügbar ist, nutze RSS-Titel und -Beschreibung\n"
            "3. Falls keine ausreichenden Informationen vorliegen, erstelle eine plausible Zusammenfassung basierend auf der URL\n"
            "4. Gib ausschließlich **gültiges minifiziertes JSON** zurück – kein Markdown, keine Kommentare\n"
            "5. Struktur: {\"title\":\"…\",\"description\":\"…\"}\n"
            "6. title: Aussagekräftiger deutscher Titel (max 100 Zeichen)\n"
            "7. description: Deutsche Zusammenfassung (zwischen 100 und 160 Wörter)\n"
            "8. Kein Text vor oder nach dem JSON\n\n"
            "### Ausgabe\n"
            "Jetzt antworte mit dem JSON:"
        )

    @staticmethod
    async def summarize_article(
            client: httpx.AsyncClient,
            url: str,
            title: str = "",
            description: str = ""
    ) -> Optional[ArticleSummary]:
        """
        Generate a summary of an article using the LLM.
        Now fetches the actual article content for more accurate summaries.

        Args:
            client: An active httpx AsyncClient for making requests
            url: URL of the article to summarize
            title: Article title from RSS feed
            description: Article description from RSS feed

        Returns:
            A dictionary containing the article title and summaries in German and English,
            or None if summarization failed
        """
        article_content = await NewsFetcher.fetch_article_content(client, url)

        if not article_content:
            logger.warning(f"⚠️ Could not fetch article content, using RSS data only")

        prompt = NewsFetcher.build_prompt(url, title, description, article_content)
        payload = {
            "model": LLM_MODEL,
            "prompt": prompt,
            "stream": False,
            "temperature": 0.1,
            "format": "json"
        }

        try:
            response = await client.post(
                f"{OLLAMA_HOST}/api/generate",
                json=payload,
                timeout=LLM_TIMEOUT_SECONDS
            )

            response.raise_for_status()
            result = response.json()
            llm_response = result["response"]

            if isinstance(llm_response, str):
                summary_data = json.loads(llm_response)
            else:
                summary_data = llm_response

            # Validate required fields
            required_fields = ["title", "description"]
            missing_fields = [field for field in required_fields if field not in summary_data]

            if missing_fields:
                logger.warning(
                    f"⚠️ Missing required fields in summary: {missing_fields}"
                )
                return None

            # Check summary quality metrics
            description = len(summary_data.get("description", "").split())

            if description > 160 or description < 100:
                logger.warning(
                    f"⚠️ Summary exceeds word limit - "
                    f"Description: {description}/160"
                )

            return cast(ArticleSummary, summary_data)

        except json.JSONDecodeError as e:
            logger.error(f"❌ JSON parsing error for {url}: {e}")
            logger.error(
                f"🔍 Raw response that failed to parse: {llm_response[:500]}..."
            )
            return None
        except httpx.HTTPError as e:
            logger.error(f"❌ HTTP error for {url}: {e}")
            return None
        except Exception as e:
            logger.error(f"❌ Unexpected error summarizing {url}: {type(e).__name__}: {e}")
            return None

    @staticmethod
    async def harvest_feeds() -> None:
        """
        Fetch articles from all feeds and store summaries in the database.
        This is the main function that runs periodically to update the news database.
        """

        total_articles = 0
        successful_articles = 0
        failed_articles = 0

        try:
            with db_manager.get_cursor() as cursor:
                cursor.execute("SELECT country, url FROM feeds")
                feeds = cursor.fetchall()

            async with httpx.AsyncClient() as client:
                for i, feed_row in enumerate(feeds, 1):
                    feed_stats = await NewsFetcher._process_feed(client, feed_row)

                    total_articles += feed_stats['total']
                    successful_articles += feed_stats['successful']
                    failed_articles += feed_stats['failed']

            current_time = int(datetime.now(timezone.utc).timestamp())
            with db_manager.get_cursor() as cursor:
                cursor.execute(
                    "UPDATE meta SET val=? WHERE key='last_sync'",
                    (str(current_time),)
                )

        except Exception as e:
            logger.error(f"❌ Critical error during harvest: {type(e).__name__}: {e}")
            raise

    @staticmethod
    async def _process_feed(
            client: httpx.AsyncClient,
            feed_row: sqlite3.Row
    ) -> Dict[str, int]:
        """
        Process a single feed, fetching and summarizing all articles.
        Now saves summaries immediately to the database with better concurrency.

        Args:
            client: An active httpx AsyncClient for making requests
            feed_row: A database row containing feed information

        Returns:
            Dictionary with processing statistics
        """
        stats = {'total': 0, 'successful': 0, 'failed': 0, 'skipped': 0}

        try:
            feed_data = feedparser.parse(feed_row["url"])

            if hasattr(feed_data, 'bozo') and feed_data.bozo:
                logger.warning(f"⚠️ Feed has parsing issues: {feed_row['url']}")
                if hasattr(feed_data, 'bozo_exception'):
                    logger.warning(f"⚠️ Feed exception: {feed_data.bozo_exception}")

            total_entries = len(feed_data.entries)

            if total_entries == 0:
                logger.warning(f"⚠️ No entries found in feed: {feed_row['url']}")
                return stats

            for i, entry in enumerate(feed_data.entries, 1):
                stats['total'] += 1

                if not hasattr(entry, "link"):
                    stats['skipped'] += 1
                    continue

                if not hasattr(entry, "published_parsed"):
                    stats['skipped'] += 1
                    continue

                article_url = entry.link

                try:
                    published = datetime(
                        *entry.published_parsed[:6],
                        tzinfo=timezone.utc
                    )
                except (TypeError, ValueError):
                    stats['skipped'] += 1
                    continue

                # Check if article already exists - use readonly connection for better concurrency
                try:
                    with db_manager.get_cursor_with_retry(readonly=True) as cursor:
                        cursor.execute("SELECT id FROM news WHERE url = ?", (article_url,))
                        if cursor.fetchone():
                            stats['skipped'] += 1
                            continue
                except Exception as db_error:
                    logger.warning(f"⚠️ Database check failed for article {i}, continuing: {db_error}")

                rss_title = getattr(entry, 'title', '')
                rss_description = getattr(entry, 'description', '') or getattr(entry, 'summary', '')

                summary = await NewsFetcher.summarize_article(
                    client,
                    article_url,
                    title=rss_title,
                    description=rss_description
                )

                if not summary:
                    logger.warning(f"❌ Failed to get summary for article {i}: {article_url}")
                    stats['failed'] += 1
                    continue

                published_timestamp = int(published.timestamp())

                try:
                    with db_manager.get_cursor_with_retry(readonly=False) as cursor:
                        cursor.execute(
                            """
                            INSERT OR IGNORE INTO news
                                (title, description, url, published, country)
                            VALUES (?, ?, ?, ?, ?)
                            """,
                            (
                                summary["title"],
                                summary["description"],
                                article_url,
                                published_timestamp,
                                feed_row["country"],
                            )
                        )

                    stats['successful'] += 1

                except Exception as db_error:
                    logger.error(f"❌ Database error for article {i}: {db_error}")
                    stats['failed'] += 1
                    continue

                await asyncio.sleep(0.01)  # 10ms delay to yield control

        except Exception as e:
            logger.error(f"❌ Error processing feed {feed_row['url']}: {type(e).__name__}: {e}")

        return stats
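A minimal sketch, not part of the commit, for running a single harvest pass by hand (for example when testing a new feed); it assumes Ollama is reachable at OLLAMA_HOST and that importing backend.app.database has initialized the database:

# Illustrative only.
import asyncio

from backend.app.services import NewsFetcher

async def main() -> None:
    # One full pass over all feeds; summaries are written as they are produced.
    await NewsFetcher.harvest_feeds()

if __name__ == "__main__":
    asyncio.run(main())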