first commit

2025-08-01 06:05:06 +02:00
commit e2c546527f
44 changed files with 11845 additions and 0 deletions

backend/.gitignore (new file, 56 lines)

@@ -0,0 +1,56 @@
# Environment variables
.env
.env.*
!example.env

# Database files
*.sqlite
*.sqlite3
*.db

# Python bytecode
__pycache__/
*.py[cod]
*$py.class

# Virtual environments
venv/
.venv/
env/
ENV/

# Distribution / packaging
dist/
build/
*.egg-info/
*.egg

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Jupyter Notebook
.ipynb_checkpoints

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Logs
*.log
logs/

# IDE specific files
.idea/
.vscode/
*.swp
*.swo

backend/app/__init__.py (new file, empty)

backend/app/main.py (new file, 635 lines)

@@ -0,0 +1,635 @@
"""
Owly News Summariser Backend
This module provides a FastAPI application that serves as the backend for the Owly News Summariser.
It handles fetching news from RSS feeds, summarizing articles using Ollama/qwen, and providing
an API for the frontend to access the summarized news.
The application uses SQLite for data storage and APScheduler for scheduling periodic news harvesting.
"""
import asyncio
import json
import os
import sqlite3
from contextlib import contextmanager
from datetime import datetime, timezone, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any, Union, Iterator, Tuple, TypedDict, cast
import feedparser
import httpx
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from fastapi import FastAPI, Response, status, Depends
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
# Constants
DB_PATH = Path("owlynews.sqlite")
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
MIN_CRON_HOURS = 0.5
DEFAULT_CRON_HOURS = float(os.getenv("CRON_HOURS", MIN_CRON_HOURS))
CRON_HOURS = max(MIN_CRON_HOURS, DEFAULT_CRON_HOURS)
SYNC_COOLDOWN_MINUTES = 30
LLM_MODEL = "qwen2:7b-instruct-q4_K_M"
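# NOTE: this tag must already be available locally (e.g. `ollama pull qwen2:7b-instruct-q4_K_M`)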
LLM_TIMEOUT_SECONDS = 180
OLLAMA_API_TIMEOUT_SECONDS = 10
# FastAPI app initialization
app = FastAPI(
title="Owly News Summariser",
description="API for the Owly News Summariser application",
version="1.0.0"
)
# Database schema definitions
SCHEMA_SQL = [
"""
CREATE TABLE IF NOT EXISTS news (
id TEXT PRIMARY KEY, -- e.g. URL as unique identifier
title TEXT NOT NULL,
summary_de TEXT,
summary_en TEXT,
        published TEXT,          -- ISO-8601 timestamp (UTC), as written by the harvester
source TEXT,
country TEXT,
source_feed TEXT
)
""",
"CREATE INDEX IF NOT EXISTS idx_news_published ON news(published)",
"""
CREATE TABLE IF NOT EXISTS feeds (
id INTEGER PRIMARY KEY, -- auto-increment via rowid
country TEXT,
url TEXT UNIQUE NOT NULL
)
""",
"""
CREATE TABLE IF NOT EXISTS settings (
key TEXT PRIMARY KEY,
val TEXT NOT NULL
)
""",
"""
CREATE TABLE IF NOT EXISTS meta (
key TEXT PRIMARY KEY,
val TEXT NOT NULL
)
"""
]
class DatabaseManager:
"""
Manages database connections and operations for the application.
Provides methods for initializing the database, executing queries,
and managing transactions.
"""
def __init__(self, db_path: Path):
"""
Initialize the database manager with the given database path.
Args:
db_path: Path to the SQLite database file
"""
        self.db_path = db_path
        self._connection: Optional[sqlite3.Connection] = None
        self._initialize_db()
def _get_connection(self) -> sqlite3.Connection:
"""
Get or create a database connection.
Returns:
An active SQLite connection
"""
if self._connection is None:
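            # check_same_thread=False lets the scheduler's background job and
            # the API handlers share this single connection.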
self._connection = sqlite3.connect(
self.db_path,
check_same_thread=False
)
self._connection.row_factory = sqlite3.Row
return self._connection
@contextmanager
def get_cursor(self) -> Iterator[sqlite3.Cursor]:
"""
Context manager that provides a database cursor and handles commits and rollbacks.
Yields:
A database cursor for executing SQL statements
Example:
```python
with db_manager.get_cursor() as cursor:
cursor.execute("SELECT * FROM table")
results = cursor.fetchall()
```
"""
conn = self._get_connection()
cursor = conn.cursor()
try:
yield cursor
conn.commit()
except Exception:
conn.rollback()
raise
def _initialize_db(self) -> None:
"""
Initialize the database schema and default settings.
Creates tables if they don't exist and inserts default values.
"""
# Create schema
with self.get_cursor() as cursor:
for stmt in SCHEMA_SQL:
cursor.execute(stmt)
# Insert initial settings
cursor.execute(
"INSERT INTO settings VALUES (?, ?) ON CONFLICT (key) DO NOTHING",
("cron_hours", str(CRON_HOURS))
)
# Insert initial metadata
cursor.execute(
"INSERT INTO meta VALUES (?, ?) ON CONFLICT (key) DO NOTHING",
("last_sync", "0")
)
# Seed feeds if none exist
cursor.execute("SELECT COUNT(*) as count FROM feeds")
if cursor.fetchone()["count"] == 0:
self._seed_feeds()
def _seed_feeds(self) -> None:
"""
Seed the database with initial feeds from the seed_feeds.json file.
Only runs if the feeds table is empty.
"""
try:
seed_path = Path(__file__).with_name("seed_feeds.json")
with open(seed_path, "r") as f:
seed_data = json.load(f)
with self.get_cursor() as cursor:
for country, urls in seed_data.items():
for url in urls:
cursor.execute(
"INSERT INTO feeds (country, url) VALUES (?, ?) "
"ON CONFLICT (url) DO NOTHING",
(country, url)
)
except (FileNotFoundError, json.JSONDecodeError) as e:
print(f"Error seeding feeds: {e}")
# Initialize database manager
db_manager = DatabaseManager(DB_PATH)
class ArticleSummary(TypedDict):
"""Type definition for article summary data returned from the LLM."""
title: str
summary_de: str
summary_en: str
class NewsFetcher:
"""
Handles fetching and summarizing news articles from RSS feeds.
Uses Ollama/qwen to generate summaries of articles.
"""
@staticmethod
def build_prompt(url: str) -> str:
"""
Generate a prompt for the LLM to summarize an article.
Args:
url: Public URL of the article to summarize
Returns:
A formatted prompt string that instructs the LLM to generate
a JSON response with title and summaries in German and English
Note:
LLMs like qwen2 don't have native web access; the model will
generate summaries based on its training data and the URL.
"""
return (
"### Aufgabe\n"
f"Du bekommst eine öffentliche URL: {url}\n"
"### Regeln\n"
"1. **Entnimm den Inhalt nicht automatisch.** "
"Falls dir der Text nicht vorliegt, antworte mit leeren Strings.\n"
"2. Gib ausschließlich **gültiges minifiziertes JSON** zurück "
"kein Markdown, keine Kommentare.\n"
"3. Struktur:\n"
"{\"title\":\"\",\"summary_de\":\"\",\"summary_en\":\"\"}\n"
"4. summary_de ≤ 160 Wörter, summary_en ≤ 160 Wörter. Zähle selbst.\n"
"5. Kein Text vor oder nach dem JSON.\n"
"### Ausgabe\n"
"Jetzt antworte."
)
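    # Illustrative shape of the reply the prompt asks for (not validated here):
    # {"title":"...","summary_de":"...","summary_en":"..."}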
@staticmethod
async def summarize_article(
client: httpx.AsyncClient,
url: str
) -> Optional[ArticleSummary]:
"""
Generate a summary of an article using the LLM.
Args:
client: An active httpx AsyncClient for making requests
url: URL of the article to summarize
Returns:
A dictionary containing the article title and summaries in German and English,
or None if summarization failed
"""
prompt = NewsFetcher.build_prompt(url)
payload = {
"model": LLM_MODEL,
"prompt": prompt,
"stream": False,
"temperature": 0.2,
"format": "json"
}
try:
response = await client.post(
f"{OLLAMA_HOST}/api/generate",
json=payload,
timeout=LLM_TIMEOUT_SECONDS
)
response.raise_for_status()
            result = response.json()
            # With format="json", Ollama still returns the model output as a
            # string in the "response" field, so parse it before casting.
            return cast(ArticleSummary, json.loads(result["response"]))
except (KeyError, ValueError, httpx.HTTPError, json.JSONDecodeError) as e:
print(f"Error summarizing article {url}: {e}")
return None
@staticmethod
async def harvest_feeds() -> None:
"""
Fetch articles from all feeds and store summaries in the database.
This is the main function that runs periodically to update the news database.
"""
try:
# Get all feeds from the database
with db_manager.get_cursor() as cursor:
cursor.execute("SELECT country, url FROM feeds")
feeds = cursor.fetchall()
# Process each feed
async with httpx.AsyncClient() as client:
for feed_row in feeds:
await NewsFetcher._process_feed(client, feed_row)
# Update last sync timestamp
current_time = int(datetime.now(timezone.utc).timestamp())
with db_manager.get_cursor() as cursor:
cursor.execute(
"UPDATE meta SET val=? WHERE key='last_sync'",
(str(current_time),)
)
except Exception as e:
print(f"Error harvesting feeds: {e}")
@staticmethod
async def _process_feed(
client: httpx.AsyncClient,
feed_row: sqlite3.Row
) -> None:
"""
Process a single feed, fetching and summarizing all articles.
Args:
client: An active httpx AsyncClient for making requests
feed_row: A database row containing feed information
"""
try:
feed_data = feedparser.parse(feed_row["url"])
for entry in feed_data.entries:
# Skip entries without links or published dates
if not hasattr(entry, "link") or not hasattr(entry, "published_parsed"):
continue
article_id = entry.link
# Parse the published date
try:
published = datetime(
*entry.published_parsed[:6],
tzinfo=timezone.utc
)
except (TypeError, ValueError):
# Skip entries with invalid dates
continue
# Get article summary
summary = await NewsFetcher.summarize_article(client, entry.link)
if not summary:
continue
# Store in database
with db_manager.get_cursor() as cursor:
cursor.execute(
"""
INSERT INTO news (
id, title, summary_de, summary_en, published,
source, country, source_feed
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT (id) DO NOTHING
""",
(
article_id,
summary["title"],
summary["summary_de"],
summary["summary_en"],
published.isoformat(),
entry.get("source", {}).get("title", feed_row["url"]),
feed_row["country"],
feed_row["url"],
)
)
except Exception as e:
print(f"Error processing feed {feed_row['url']}: {e}")
# Initialize scheduler; start() is deferred to app startup so the scheduler
# binds to the event loop that uvicorn actually runs.
scheduler = AsyncIOScheduler(timezone="UTC")
scheduler.add_job(
    NewsFetcher.harvest_feeds,
    "interval",
    hours=CRON_HOURS,
    id="harvest"
)


@app.on_event("startup")
async def start_scheduler() -> None:
    """Start the background scheduler once the event loop is running."""
    scheduler.start()
# Pydantic models for API requests and responses
class CronSettings(BaseModel):
"""Settings for the cron job that harvests news."""
hours: float
class FeedData(BaseModel):
"""Data for a news feed."""
country: str
url: str
class ModelStatus(BaseModel):
"""Status of the LLM model."""
name: str
status: str
available_models: List[str]
class ErrorResponse(BaseModel):
"""Standard error response."""
status: str
message: str
class SuccessResponse(BaseModel):
"""Standard success response."""
status: str
class TimestampResponse(BaseModel):
"""Response containing a timestamp."""
ts: int
class HoursResponse(BaseModel):
"""Response containing hours setting."""
hours: float
# Dependency for getting a database cursor
def get_db() -> Iterator[sqlite3.Cursor]:
"""
Dependency that provides a database cursor.
Yields:
A database cursor for executing SQL statements
"""
with db_manager.get_cursor() as cursor:
yield cursor
# API endpoints
@app.get("/news", response_model=List[Dict[str, Any]])
async def get_news(
country: str = "DE",
from_: str = "2025-07-01",
    to: Optional[str] = None,
db: sqlite3.Cursor = Depends(get_db)
):
"""
Get news articles filtered by country and date range.
Args:
country: Country code to filter by (default: "DE")
from_: Start date in ISO format (default: "2025-07-01")
to: End date in ISO format (default: current date)
db: Database cursor dependency
Returns:
List of news articles matching the criteria
"""
    # A function default for `to` would be frozen at import time, so resolve
    # "today" per request instead.
    if to is None:
        to = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    db.execute(
        """
        SELECT * FROM news
        WHERE country=? AND date(published) BETWEEN ? AND ?
        ORDER BY published DESC
        """,
        (country, from_, to)
    )
return [dict(row) for row in db.fetchall()]
@app.get("/meta/last-sync", response_model=TimestampResponse)
async def get_last_sync(db: sqlite3.Cursor = Depends(get_db)):
"""
Get the timestamp of the last successful feed synchronization.
Args:
db: Database cursor dependency
Returns:
Object containing the timestamp as a Unix epoch
"""
db.execute("SELECT val FROM meta WHERE key='last_sync'")
row = db.fetchone()
return {"ts": int(row["val"])}
@app.put("/settings/cron", response_model=HoursResponse)
async def set_cron_schedule(
data: CronSettings,
db: sqlite3.Cursor = Depends(get_db)
):
"""
Update the cron schedule for harvesting news.
Args:
data: New cron settings with hours interval
db: Database cursor dependency
Returns:
Object containing the updated hours setting
"""
# Ensure minimum interval
hours = max(MIN_CRON_HOURS, data.hours)
    # Update scheduler; reschedule_job replaces the trigger (Job.modify cannot
    # change trigger parameters)
    scheduler.reschedule_job("harvest", trigger="interval", hours=hours)
# Update database
db.execute(
"UPDATE settings SET val=? WHERE key='cron_hours'",
(str(hours),)
)
return {"hours": hours}
@app.get("/feeds", response_model=List[Dict[str, Any]])
async def list_feeds(db: sqlite3.Cursor = Depends(get_db)):
"""
List all registered news feeds.
Args:
db: Database cursor dependency
Returns:
List of feed objects with id, country, and url
"""
db.execute("SELECT * FROM feeds ORDER BY country")
return [dict(row) for row in db.fetchall()]
@app.post("/feeds", response_model=SuccessResponse)
async def add_feed(
feed: FeedData,
db: sqlite3.Cursor = Depends(get_db)
):
"""
Add a new news feed.
Args:
feed: Feed data with country and URL
db: Database cursor dependency
Returns:
Success status
"""
db.execute(
"INSERT INTO feeds (country, url) VALUES (?, ?) "
"ON CONFLICT (url) DO NOTHING",
(feed.country, feed.url)
)
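    # ON CONFLICT (url) DO NOTHING makes this idempotent; the response is
    # "added" even when the URL was already registered.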
return {"status": "added"}
@app.delete("/feeds", response_model=SuccessResponse)
async def delete_feed(
url: str,
db: sqlite3.Cursor = Depends(get_db)
):
"""
Delete a news feed by URL.
Args:
url: URL of the feed to delete
db: Database cursor dependency
Returns:
Success status
"""
db.execute("DELETE FROM feeds WHERE url=?", (url,))
return {"status": "deleted"}
@app.get("/model/status", response_model=Union[ModelStatus, ErrorResponse])
async def get_model_status():
"""
Check the status of the LLM model.
Returns:
Object containing model name, status, and available models,
or an error response if the model service is unavailable
"""
try:
async with httpx.AsyncClient() as client:
# Get model information from Ollama
response = await client.get(
f"{OLLAMA_HOST}/api/tags",
timeout=OLLAMA_API_TIMEOUT_SECONDS
)
response.raise_for_status()
models_data = response.json()
models = models_data.get("models", [])
# Check if the current model is available
model_available = any(
model.get("name") == LLM_MODEL for model in models
)
return {
"name": LLM_MODEL,
"status": "ready" if model_available else "not available",
"available_models": [model.get("name") for model in models]
}
except Exception as e:
return {"status": "error", "message": str(e)}
@app.post("/sync", response_model=None)
async def manual_sync(db: sqlite3.Cursor = Depends(get_db)):
"""
Manually trigger a feed synchronization.
Args:
db: Database cursor dependency
Returns:
Success status or error response if sync was triggered too recently
"""
# Check when the last sync was performed
db.execute("SELECT val FROM meta WHERE key='last_sync'")
row = db.fetchone()
last_sync_ts = int(row["val"])
# Enforce cooldown period
now = datetime.now(timezone.utc)
last_sync_time = datetime.fromtimestamp(last_sync_ts, timezone.utc)
if now - last_sync_time < timedelta(minutes=SYNC_COOLDOWN_MINUTES):
return Response(
status_code=status.HTTP_429_TOO_MANY_REQUESTS,
content=f"Sync too soon wait {SYNC_COOLDOWN_MINUTES} min."
)
# Trigger sync in background
asyncio.create_task(NewsFetcher.harvest_feeds())
return {"status": "triggered"}
# Mount static frontend
frontend_path = os.path.join(
os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
"frontend",
"dist"
)
app.mount("/", StaticFiles(directory=frontend_path, html=True), name="static")

backend/app/seed_feeds.json (new file, 9 lines)

@@ -0,0 +1,9 @@
{
"DE": [
"https://www.tagesschau.de/xml/rss2",
"https://www.spiegel.de/schlagzeilen/tops/index.rss"
],
"EU": [
"https://www.euronews.com/rss?level=theme&name=news"
]
}

backend/example.env (new file, 8 lines)

@@ -0,0 +1,8 @@
# URL for the Ollama service
OLLAMA_HOST=http://localhost:11434

# Interval for scheduled news fetching in hours (minimum: 0.5)
CRON_HOURS=1

# SQLite database connection string
DATABASE_URL=sqlite:///./newsdb.sqlite

backend/requirements.txt (new file, 10 lines)

@@ -0,0 +1,10 @@
aiofiles
apscheduler
fastapi
feedparser
httpx
pydantic
uvicorn[standard]
python-multipart
psycopg2-binary
sqlalchemy
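
Note: main.py opens owlynews.sqlite through a hardcoded DB_PATH and never reads DATABASE_URL, and neither sqlalchemy nor psycopg2-binary is imported anywhere yet; presumably these are groundwork for a later migration.

For reference, a minimal smoke test of the new API could look like the sketch below. It assumes the backend is served with uvicorn (e.g. `uvicorn app.main:app` from backend/) and reachable on localhost:8000; the port and module path are assumptions, not part of this commit.

# smoke_test.py - illustrative sketch, not part of this commit
import httpx

BASE = "http://localhost:8000"  # assumed uvicorn default port

def main() -> None:
    with httpx.Client(base_url=BASE, timeout=10) as client:
        # Register a feed; ON CONFLICT (url) DO NOTHING makes this idempotent
        r = client.post(
            "/feeds",
            json={"country": "DE", "url": "https://www.tagesschau.de/xml/rss2"},
        )
        print("add feed:", r.json())

        # Trigger a manual harvest; 429 just means the 30-minute cooldown is active
        r = client.post("/sync")
        print("sync:", r.status_code)

        # Read back summarised articles for Germany
        r = client.get("/news", params={"country": "DE"})
        print("articles:", len(r.json()))

if __name__ == "__main__":
    main()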