first commit

2025-08-01 06:05:06 +02:00
commit e2c546527f
44 changed files with 11845 additions and 0 deletions

backend/.gitignore (new file, 56 lines)

@@ -0,0 +1,56 @@
# Environment variables
.env
.env.*
!example.env

# Database files
*.sqlite
*.sqlite3
*.db

# Python bytecode
__pycache__/
*.py[cod]
*$py.class

# Virtual environments
venv/
.venv/
env/
ENV/

# Distribution / packaging
dist/
build/
*.egg-info/
*.egg

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Jupyter Notebook
.ipynb_checkpoints

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Logs
*.log
logs/

# IDE specific files
.idea/
.vscode/
*.swp
*.swo

backend/app/__init__.py (new file, empty)

backend/app/main.py (new file, 635 lines)

@@ -0,0 +1,635 @@
"""
Owly News Summariser Backend
This module provides a FastAPI application that serves as the backend for the Owly News Summariser.
It handles fetching news from RSS feeds, summarizing articles using Ollama/qwen, and providing
an API for the frontend to access the summarized news.
The application uses SQLite for data storage and APScheduler for scheduling periodic news harvesting.
"""
import asyncio
import json
import os
import sqlite3
from contextlib import contextmanager
from datetime import datetime, timezone, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any, Union, Iterator, Tuple, TypedDict, cast
import feedparser
import httpx
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from fastapi import FastAPI, Response, status, Depends
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
# Constants
DB_PATH = Path("owlynews.sqlite")
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
MIN_CRON_HOURS = 0.5
DEFAULT_CRON_HOURS = float(os.getenv("CRON_HOURS", MIN_CRON_HOURS))
CRON_HOURS = max(MIN_CRON_HOURS, DEFAULT_CRON_HOURS)
SYNC_COOLDOWN_MINUTES = 30
LLM_MODEL = "qwen2:7b-instruct-q4_K_M"
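# NOTE: this tag must already be available locally (e.g. `ollama pull qwen2:7b-instruct-q4_K_M`)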
LLM_TIMEOUT_SECONDS = 180
OLLAMA_API_TIMEOUT_SECONDS = 10
# FastAPI app initialization
app = FastAPI(
title="Owly News Summariser",
description="API for the Owly News Summariser application",
version="1.0.0"
)
# Database schema definitions
SCHEMA_SQL = [
"""
CREATE TABLE IF NOT EXISTS news (
id TEXT PRIMARY KEY, -- e.g. URL as unique identifier
title TEXT NOT NULL,
summary_de TEXT,
summary_en TEXT,
        published TEXT,          -- ISO-8601 timestamp (UTC), as written by the harvester
source TEXT,
country TEXT,
source_feed TEXT
)
""",
"CREATE INDEX IF NOT EXISTS idx_news_published ON news(published)",
"""
CREATE TABLE IF NOT EXISTS feeds (
id INTEGER PRIMARY KEY, -- auto-increment via rowid
country TEXT,
url TEXT UNIQUE NOT NULL
)
""",
"""
CREATE TABLE IF NOT EXISTS settings (
key TEXT PRIMARY KEY,
val TEXT NOT NULL
)
""",
"""
CREATE TABLE IF NOT EXISTS meta (
key TEXT PRIMARY KEY,
val TEXT NOT NULL
)
"""
]
class DatabaseManager:
"""
Manages database connections and operations for the application.
Provides methods for initializing the database, executing queries,
and managing transactions.
"""
def __init__(self, db_path: Path):
"""
Initialize the database manager with the given database path.
Args:
db_path: Path to the SQLite database file
"""
        self.db_path = db_path
        self._connection: Optional[sqlite3.Connection] = None
        self._initialize_db()
def _get_connection(self) -> sqlite3.Connection:
"""
Get or create a database connection.
Returns:
An active SQLite connection
"""
if self._connection is None:
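            # check_same_thread=False lets the scheduler's background job and
            # the API handlers share this single connection.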
self._connection = sqlite3.connect(
self.db_path,
check_same_thread=False
)
self._connection.row_factory = sqlite3.Row
return self._connection
@contextmanager
def get_cursor(self) -> Iterator[sqlite3.Cursor]:
"""
Context manager that provides a database cursor and handles commits and rollbacks.
Yields:
A database cursor for executing SQL statements
Example:
```python
with db_manager.get_cursor() as cursor:
cursor.execute("SELECT * FROM table")
results = cursor.fetchall()
```
"""
conn = self._get_connection()
cursor = conn.cursor()
try:
yield cursor
conn.commit()
except Exception:
conn.rollback()
raise
def _initialize_db(self) -> None:
"""
Initialize the database schema and default settings.
Creates tables if they don't exist and inserts default values.
"""
# Create schema
with self.get_cursor() as cursor:
for stmt in SCHEMA_SQL:
cursor.execute(stmt)
# Insert initial settings
cursor.execute(
"INSERT INTO settings VALUES (?, ?) ON CONFLICT (key) DO NOTHING",
("cron_hours", str(CRON_HOURS))
)
# Insert initial metadata
cursor.execute(
"INSERT INTO meta VALUES (?, ?) ON CONFLICT (key) DO NOTHING",
("last_sync", "0")
)
# Seed feeds if none exist
cursor.execute("SELECT COUNT(*) as count FROM feeds")
if cursor.fetchone()["count"] == 0:
self._seed_feeds()
def _seed_feeds(self) -> None:
"""
Seed the database with initial feeds from the seed_feeds.json file.
Only runs if the feeds table is empty.
"""
try:
seed_path = Path(__file__).with_name("seed_feeds.json")
with open(seed_path, "r") as f:
seed_data = json.load(f)
with self.get_cursor() as cursor:
for country, urls in seed_data.items():
for url in urls:
cursor.execute(
"INSERT INTO feeds (country, url) VALUES (?, ?) "
"ON CONFLICT (url) DO NOTHING",
(country, url)
)
except (FileNotFoundError, json.JSONDecodeError) as e:
print(f"Error seeding feeds: {e}")
# Initialize database manager
db_manager = DatabaseManager(DB_PATH)
class ArticleSummary(TypedDict):
"""Type definition for article summary data returned from the LLM."""
title: str
summary_de: str
summary_en: str
class NewsFetcher:
"""
Handles fetching and summarizing news articles from RSS feeds.
Uses Ollama/qwen to generate summaries of articles.
"""
@staticmethod
def build_prompt(url: str) -> str:
"""
Generate a prompt for the LLM to summarize an article.
Args:
url: Public URL of the article to summarize
Returns:
A formatted prompt string that instructs the LLM to generate
a JSON response with title and summaries in German and English
Note:
LLMs like qwen2 don't have native web access; the model will
generate summaries based on its training data and the URL.
"""
return (
"### Aufgabe\n"
f"Du bekommst eine öffentliche URL: {url}\n"
"### Regeln\n"
"1. **Entnimm den Inhalt nicht automatisch.** "
"Falls dir der Text nicht vorliegt, antworte mit leeren Strings.\n"
"2. Gib ausschließlich **gültiges minifiziertes JSON** zurück "
"kein Markdown, keine Kommentare.\n"
"3. Struktur:\n"
"{\"title\":\"\",\"summary_de\":\"\",\"summary_en\":\"\"}\n"
"4. summary_de ≤ 160 Wörter, summary_en ≤ 160 Wörter. Zähle selbst.\n"
"5. Kein Text vor oder nach dem JSON.\n"
"### Ausgabe\n"
"Jetzt antworte."
)
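    # Illustrative shape of the reply the prompt asks for (not validated here):
    # {"title":"...","summary_de":"...","summary_en":"..."}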
@staticmethod
async def summarize_article(
client: httpx.AsyncClient,
url: str
) -> Optional[ArticleSummary]:
"""
Generate a summary of an article using the LLM.
Args:
client: An active httpx AsyncClient for making requests
url: URL of the article to summarize
Returns:
A dictionary containing the article title and summaries in German and English,
or None if summarization failed
"""
prompt = NewsFetcher.build_prompt(url)
payload = {
"model": LLM_MODEL,
"prompt": prompt,
"stream": False,
"temperature": 0.2,
"format": "json"
}
try:
response = await client.post(
f"{OLLAMA_HOST}/api/generate",
json=payload,
timeout=LLM_TIMEOUT_SECONDS
)
response.raise_for_status()
            result = response.json()
            # With format="json", Ollama still returns the model output as a
            # string in the "response" field, so parse it before casting.
            return cast(ArticleSummary, json.loads(result["response"]))
except (KeyError, ValueError, httpx.HTTPError, json.JSONDecodeError) as e:
print(f"Error summarizing article {url}: {e}")
return None
@staticmethod
async def harvest_feeds() -> None:
"""
Fetch articles from all feeds and store summaries in the database.
This is the main function that runs periodically to update the news database.
"""
try:
# Get all feeds from the database
with db_manager.get_cursor() as cursor:
cursor.execute("SELECT country, url FROM feeds")
feeds = cursor.fetchall()
# Process each feed
async with httpx.AsyncClient() as client:
for feed_row in feeds:
await NewsFetcher._process_feed(client, feed_row)
# Update last sync timestamp
current_time = int(datetime.now(timezone.utc).timestamp())
with db_manager.get_cursor() as cursor:
cursor.execute(
"UPDATE meta SET val=? WHERE key='last_sync'",
(str(current_time),)
)
except Exception as e:
print(f"Error harvesting feeds: {e}")
@staticmethod
async def _process_feed(
client: httpx.AsyncClient,
feed_row: sqlite3.Row
) -> None:
"""
Process a single feed, fetching and summarizing all articles.
Args:
client: An active httpx AsyncClient for making requests
feed_row: A database row containing feed information
"""
try:
feed_data = feedparser.parse(feed_row["url"])
for entry in feed_data.entries:
# Skip entries without links or published dates
if not hasattr(entry, "link") or not hasattr(entry, "published_parsed"):
continue
article_id = entry.link
# Parse the published date
try:
published = datetime(
*entry.published_parsed[:6],
tzinfo=timezone.utc
)
except (TypeError, ValueError):
# Skip entries with invalid dates
continue
# Get article summary
summary = await NewsFetcher.summarize_article(client, entry.link)
if not summary:
continue
# Store in database
with db_manager.get_cursor() as cursor:
cursor.execute(
"""
INSERT INTO news (
id, title, summary_de, summary_en, published,
source, country, source_feed
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT (id) DO NOTHING
""",
(
article_id,
summary["title"],
summary["summary_de"],
summary["summary_en"],
published.isoformat(),
entry.get("source", {}).get("title", feed_row["url"]),
feed_row["country"],
feed_row["url"],
)
)
except Exception as e:
print(f"Error processing feed {feed_row['url']}: {e}")
# Initialize scheduler; start() is deferred to app startup so the scheduler
# binds to the event loop that uvicorn actually runs.
scheduler = AsyncIOScheduler(timezone="UTC")
scheduler.add_job(
    NewsFetcher.harvest_feeds,
    "interval",
    hours=CRON_HOURS,
    id="harvest"
)


@app.on_event("startup")
async def start_scheduler() -> None:
    """Start the background scheduler once the event loop is running."""
    scheduler.start()
# Pydantic models for API requests and responses
class CronSettings(BaseModel):
"""Settings for the cron job that harvests news."""
hours: float
class FeedData(BaseModel):
"""Data for a news feed."""
country: str
url: str
class ModelStatus(BaseModel):
"""Status of the LLM model."""
name: str
status: str
available_models: List[str]
class ErrorResponse(BaseModel):
"""Standard error response."""
status: str
message: str
class SuccessResponse(BaseModel):
"""Standard success response."""
status: str
class TimestampResponse(BaseModel):
"""Response containing a timestamp."""
ts: int
class HoursResponse(BaseModel):
"""Response containing hours setting."""
hours: float
# Dependency for getting a database cursor
def get_db() -> Iterator[sqlite3.Cursor]:
"""
Dependency that provides a database cursor.
Yields:
A database cursor for executing SQL statements
"""
with db_manager.get_cursor() as cursor:
yield cursor
# API endpoints
@app.get("/news", response_model=List[Dict[str, Any]])
async def get_news(
country: str = "DE",
from_: str = "2025-07-01",
    to: Optional[str] = None,
db: sqlite3.Cursor = Depends(get_db)
):
"""
Get news articles filtered by country and date range.
Args:
country: Country code to filter by (default: "DE")
from_: Start date in ISO format (default: "2025-07-01")
to: End date in ISO format (default: current date)
db: Database cursor dependency
Returns:
List of news articles matching the criteria
"""
    # A function default for `to` would be frozen at import time, so resolve
    # "today" per request instead.
    if to is None:
        to = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    db.execute(
        """
        SELECT * FROM news
        WHERE country=? AND date(published) BETWEEN ? AND ?
        ORDER BY published DESC
        """,
        (country, from_, to)
    )
return [dict(row) for row in db.fetchall()]
@app.get("/meta/last-sync", response_model=TimestampResponse)
async def get_last_sync(db: sqlite3.Cursor = Depends(get_db)):
"""
Get the timestamp of the last successful feed synchronization.
Args:
db: Database cursor dependency
Returns:
Object containing the timestamp as a Unix epoch
"""
db.execute("SELECT val FROM meta WHERE key='last_sync'")
row = db.fetchone()
return {"ts": int(row["val"])}
@app.put("/settings/cron", response_model=HoursResponse)
async def set_cron_schedule(
data: CronSettings,
db: sqlite3.Cursor = Depends(get_db)
):
"""
Update the cron schedule for harvesting news.
Args:
data: New cron settings with hours interval
db: Database cursor dependency
Returns:
Object containing the updated hours setting
"""
# Ensure minimum interval
hours = max(MIN_CRON_HOURS, data.hours)
    # Update scheduler; reschedule_job replaces the trigger (Job.modify cannot
    # change trigger parameters)
    scheduler.reschedule_job("harvest", trigger="interval", hours=hours)
# Update database
db.execute(
"UPDATE settings SET val=? WHERE key='cron_hours'",
(str(hours),)
)
return {"hours": hours}
@app.get("/feeds", response_model=List[Dict[str, Any]])
async def list_feeds(db: sqlite3.Cursor = Depends(get_db)):
"""
List all registered news feeds.
Args:
db: Database cursor dependency
Returns:
List of feed objects with id, country, and url
"""
db.execute("SELECT * FROM feeds ORDER BY country")
return [dict(row) for row in db.fetchall()]
@app.post("/feeds", response_model=SuccessResponse)
async def add_feed(
feed: FeedData,
db: sqlite3.Cursor = Depends(get_db)
):
"""
Add a new news feed.
Args:
feed: Feed data with country and URL
db: Database cursor dependency
Returns:
Success status
"""
db.execute(
"INSERT INTO feeds (country, url) VALUES (?, ?) "
"ON CONFLICT (url) DO NOTHING",
(feed.country, feed.url)
)
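    # ON CONFLICT (url) DO NOTHING makes this idempotent; the response is
    # "added" even when the URL was already registered.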
return {"status": "added"}
@app.delete("/feeds", response_model=SuccessResponse)
async def delete_feed(
url: str,
db: sqlite3.Cursor = Depends(get_db)
):
"""
Delete a news feed by URL.
Args:
url: URL of the feed to delete
db: Database cursor dependency
Returns:
Success status
"""
db.execute("DELETE FROM feeds WHERE url=?", (url,))
return {"status": "deleted"}
@app.get("/model/status", response_model=Union[ModelStatus, ErrorResponse])
async def get_model_status():
"""
Check the status of the LLM model.
Returns:
Object containing model name, status, and available models,
or an error response if the model service is unavailable
"""
try:
async with httpx.AsyncClient() as client:
# Get model information from Ollama
response = await client.get(
f"{OLLAMA_HOST}/api/tags",
timeout=OLLAMA_API_TIMEOUT_SECONDS
)
response.raise_for_status()
models_data = response.json()
models = models_data.get("models", [])
# Check if the current model is available
model_available = any(
model.get("name") == LLM_MODEL for model in models
)
return {
"name": LLM_MODEL,
"status": "ready" if model_available else "not available",
"available_models": [model.get("name") for model in models]
}
except Exception as e:
return {"status": "error", "message": str(e)}
@app.post("/sync", response_model=None)
async def manual_sync(db: sqlite3.Cursor = Depends(get_db)):
"""
Manually trigger a feed synchronization.
Args:
db: Database cursor dependency
Returns:
Success status or error response if sync was triggered too recently
"""
# Check when the last sync was performed
db.execute("SELECT val FROM meta WHERE key='last_sync'")
row = db.fetchone()
last_sync_ts = int(row["val"])
# Enforce cooldown period
now = datetime.now(timezone.utc)
last_sync_time = datetime.fromtimestamp(last_sync_ts, timezone.utc)
if now - last_sync_time < timedelta(minutes=SYNC_COOLDOWN_MINUTES):
return Response(
status_code=status.HTTP_429_TOO_MANY_REQUESTS,
content=f"Sync too soon wait {SYNC_COOLDOWN_MINUTES} min."
)
# Trigger sync in background
asyncio.create_task(NewsFetcher.harvest_feeds())
return {"status": "triggered"}
# Mount static frontend
frontend_path = os.path.join(
os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
"frontend",
"dist"
)
app.mount("/", StaticFiles(directory=frontend_path, html=True), name="static")

backend/app/seed_feeds.json (new file, 9 lines)

@@ -0,0 +1,9 @@
{
"DE": [
"https://www.tagesschau.de/xml/rss2",
"https://www.spiegel.de/schlagzeilen/tops/index.rss"
],
"EU": [
"https://www.euronews.com/rss?level=theme&name=news"
]
}

backend/example.env (new file, 8 lines)

@@ -0,0 +1,8 @@
# URL for the Ollama service
OLLAMA_HOST=http://localhost:11434

# Interval for scheduled news fetching in hours (minimum: 0.5)
CRON_HOURS=1

# SQLite database connection string
DATABASE_URL=sqlite:///./newsdb.sqlite

backend/requirements.txt (new file, 10 lines)

@@ -0,0 +1,10 @@
aiofiles
apscheduler
fastapi
feedparser
httpx
pydantic
uvicorn[standard]
python-multipart
psycopg2-binary
sqlalchemy
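
Note: main.py opens owlynews.sqlite through a hardcoded DB_PATH and never reads DATABASE_URL, and neither sqlalchemy nor psycopg2-binary is imported anywhere yet; presumably these are groundwork for a later migration.

For reference, a minimal smoke test of the new API could look like the sketch below. It assumes the backend is served with uvicorn (e.g. `uvicorn app.main:app` from backend/) and reachable on localhost:8000; the port and module path are assumptions, not part of this commit.

# smoke_test.py - illustrative sketch, not part of this commit
import httpx

BASE = "http://localhost:8000"  # assumed uvicorn default port

def main() -> None:
    with httpx.Client(base_url=BASE, timeout=10) as client:
        # Register a feed; ON CONFLICT (url) DO NOTHING makes this idempotent
        r = client.post(
            "/feeds",
            json={"country": "DE", "url": "https://www.tagesschau.de/xml/rss2"},
        )
        print("add feed:", r.json())

        # Trigger a manual harvest; 429 just means the 30-minute cooldown is active
        r = client.post("/sync")
        print("sync:", r.status_code)

        # Read back summarised articles for Germany
        r = client.get("/news", params={"country": "DE"})
        print("articles:", len(r.json()))

if __name__ == "__main__":
    main()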