refactor: apply consistent formatting and improve code readability across backend modules

2025-08-01 22:51:38 +02:00
parent 0fd2c7a8b6
commit e1f51794af
5 changed files with 147 additions and 65 deletions
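For orientation before the diff: the formatting pass sorts imports alphabetically and wraps them in parentheses instead of backslash continuations, breaks long signatures and argument lists one item per line, and splits long logging calls across lines to stay within the line-length limit. A minimal sketch of the target style, using names that appear in the hunks below (illustrative only, not additional code from this commit):

import httpx

from backend.app.config import (
    LLM_MODEL,      # one name per line, sorted alphabetically
    OLLAMA_HOST,
    logger,
)


async def fetch_article_content(
        client: httpx.AsyncClient,
        url: str) -> str:
    # long calls are wrapped across lines rather than left over-long
    logger.warning(
        f"Could not fetch article content from {url}")
    return ""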

@@ -3,14 +3,20 @@ import json
import re
import sqlite3
from datetime import datetime, timezone
from typing import Optional, cast, Dict
from typing import Dict, Optional, cast
import feedparser
import httpx
from bs4 import BeautifulSoup
from backend.app.config import ARTICLE_FETCH_TIMEOUT, MAX_ARTICLE_LENGTH, logger, LLM_MODEL, OLLAMA_HOST, \
LLM_TIMEOUT_SECONDS
from backend.app.config import (
ARTICLE_FETCH_TIMEOUT,
LLM_MODEL,
LLM_TIMEOUT_SECONDS,
MAX_ARTICLE_LENGTH,
OLLAMA_HOST,
logger,
)
from backend.app.database import db_manager
from backend.app.models import ArticleSummary
@@ -22,7 +28,9 @@ class NewsFetcher:
"""
@staticmethod
async def fetch_article_content(client: httpx.AsyncClient, url: str) -> str:
async def fetch_article_content(
client: httpx.AsyncClient,
url: str) -> str:
"""
Fetch and extract the main content from an article URL.
@@ -51,7 +59,14 @@ class NewsFetcher:
soup = BeautifulSoup(response.text, 'html.parser')
for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']):
for element in soup(['script',
'style',
'nav',
'header',
'footer',
'aside',
'form',
'button']):
element.decompose()
content_selectors = [
@@ -84,7 +99,8 @@ class NewsFetcher:
if body:
article_text = body.get_text(separator=' ', strip=True)
article_text = re.sub(r'\s+', ' ', article_text) # Normalize whitespace
article_text = re.sub(
r'\s+', ' ', article_text) # Normalize whitespace
article_text = article_text.strip()
# Limit length to avoid overwhelming the LLM
@@ -97,14 +113,21 @@ class NewsFetcher:
logger.warning(f"⏰ Timeout fetching article content from: {url}")
return ""
except httpx.HTTPError as e:
logger.warning(f"🌐 HTTP error fetching article content from {url}: {e}")
logger.warning(
f"🌐 HTTP error fetching article content from {url}: {e}")
return ""
except Exception as e:
logger.warning(f"❌ Error fetching article content from {url}: {type(e).__name__}: {e}")
logger.warning(
f"❌ Error fetching article content from {url}: {
type(e).__name__}: {e}")
return ""
@staticmethod
def build_prompt(url: str, title: str = "", description: str = "", content: str = "") -> str:
def build_prompt(
url: str,
title: str = "",
description: str = "",
content: str = "") -> str:
"""
Generate a prompt for the LLM to summarize an article.
@@ -124,10 +147,12 @@ class NewsFetcher:
if description:
context_info.append(f"RSS-Beschreibung: {description}")
if content:
content_preview = content[:500] + "..." if len(content) > 500 else content
content_preview = content[:500] + \
"..." if len(content) > 500 else content
context_info.append(f"Artikel-Inhalt: {content_preview}")
context = "\n".join(context_info) if context_info else "Keine zusätzlichen Informationen verfügbar."
context = "\n".join(
context_info) if context_info else "Keine zusätzlichen Informationen verfügbar."
return (
"### Aufgabe\n"
@@ -171,9 +196,11 @@ class NewsFetcher:
article_content = await NewsFetcher.fetch_article_content(client, url)
if not article_content:
logger.warning(f"⚠️ Could not fetch article content, using RSS data only")
logger.warning(
f"⚠️ Could not fetch article content, using RSS data only")
prompt = NewsFetcher.build_prompt(url, title, description, article_content)
prompt = NewsFetcher.build_prompt(
url, title, description, article_content)
payload = {
"model": LLM_MODEL,
"prompt": prompt,
@@ -200,7 +227,8 @@ class NewsFetcher:
# Validate required fields
required_fields = ["title", "description"]
missing_fields = [field for field in required_fields if field not in summary_data]
missing_fields = [
field for field in required_fields if field not in summary_data]
if missing_fields:
logger.warning(
@@ -229,7 +257,9 @@ class NewsFetcher:
logger.error(f"❌ HTTP error for {url}: {e}")
return None
except Exception as e:
logger.error(f"❌ Unexpected error summarizing {url}: {type(e).__name__}: {e}")
logger.error(
f"❌ Unexpected error summarizing {url}: {
type(e).__name__}: {e}")
return None
@staticmethod
@@ -264,7 +294,9 @@ class NewsFetcher:
)
except Exception as e:
logger.error(f"❌ Critical error during harvest: {type(e).__name__}: {e}")
logger.error(
f"❌ Critical error during harvest: {
type(e).__name__}: {e}")
raise
@staticmethod
@@ -289,14 +321,20 @@ class NewsFetcher:
feed_data = feedparser.parse(feed_row["url"])
if hasattr(feed_data, 'bozo') and feed_data.bozo:
logger.warning(f"⚠️ Feed has parsing issues: {feed_row['url']}")
logger.warning(
f"⚠️ Feed has parsing issues: {
feed_row['url']}")
if hasattr(feed_data, 'bozo_exception'):
logger.warning(f"⚠️ Feed exception: {feed_data.bozo_exception}")
logger.warning(
f"⚠️ Feed exception: {
feed_data.bozo_exception}")
total_entries = len(feed_data.entries)
if total_entries == 0:
logger.warning(f"⚠️ No entries found in feed: {feed_row['url']}")
logger.warning(
f"⚠️ No entries found in feed: {
feed_row['url']}")
return stats
for i, entry in enumerate(feed_data.entries, 1):
@@ -321,18 +359,23 @@ class NewsFetcher:
stats['skipped'] += 1
continue
# Check if article already exists - use readonly connection for better concurrency
# Check if article already exists - use readonly connection for
# better concurrency
try:
with db_manager.get_cursor_with_retry(readonly=True) as cursor:
cursor.execute("SELECT id FROM news WHERE url = ?", (article_url,))
cursor.execute(
"SELECT id FROM news WHERE url = ?", (article_url,))
if cursor.fetchone():
stats['skipped'] += 1
continue
except Exception as db_error:
logger.warning(f"⚠️ Database check failed for article {i}, continuing: {db_error}")
logger.warning(
f"⚠️ Database check failed for article {i}, continuing: {db_error}")
rss_title = getattr(entry, 'title', '')
rss_description = getattr(entry, 'description', '') or getattr(entry, 'summary', '')
rss_description = getattr(
entry, 'description', '') or getattr(
entry, 'summary', '')
summary = await NewsFetcher.summarize_article(
client,
@@ -342,7 +385,8 @@ class NewsFetcher:
)
if not summary:
logger.warning(f"❌ Failed to get summary for article {i}: {article_url}")
logger.warning(
f"❌ Failed to get summary for article {i}: {article_url}")
stats['failed'] += 1
continue
@@ -353,7 +397,7 @@ class NewsFetcher:
cursor.execute(
"""
INSERT
OR IGNORE INTO news
OR IGNORE INTO news
(title, description, url, published, country)
VALUES (?, ?, ?, ?, ?)
""",
@@ -369,13 +413,17 @@ class NewsFetcher:
stats['successful'] += 1
except Exception as db_error:
logger.error(f"❌ Database error for article {i}: {db_error}")
logger.error(
f"❌ Database error for article {i}: {db_error}")
stats['failed'] += 1
continue
await asyncio.sleep(0.01) # 10ms delay to yield control
except Exception as e:
logger.error(f"❌ Error processing feed {feed_row['url']}: {type(e).__name__}: {e}")
logger.error(
f"❌ Error processing feed {
feed_row['url']}: {
type(e).__name__}: {e}")
return stats