refactor: apply consistent formatting and improve code readability across backend modules
@@ -3,14 +3,20 @@ import json
 import re
 import sqlite3
 from datetime import datetime, timezone
-from typing import Optional, cast, Dict
+from typing import Dict, Optional, cast
 
 import feedparser
 import httpx
 from bs4 import BeautifulSoup
 
-from backend.app.config import ARTICLE_FETCH_TIMEOUT, MAX_ARTICLE_LENGTH, logger, LLM_MODEL, OLLAMA_HOST, \
-    LLM_TIMEOUT_SECONDS
+from backend.app.config import (
+    ARTICLE_FETCH_TIMEOUT,
+    LLM_MODEL,
+    LLM_TIMEOUT_SECONDS,
+    MAX_ARTICLE_LENGTH,
+    OLLAMA_HOST,
+    logger,
+)
 from backend.app.database import db_manager
 from backend.app.models import ArticleSummary
 
@@ -22,7 +28,9 @@ class NewsFetcher:
     """
 
     @staticmethod
-    async def fetch_article_content(client: httpx.AsyncClient, url: str) -> str:
+    async def fetch_article_content(
+            client: httpx.AsyncClient,
+            url: str) -> str:
         """
         Fetch and extract the main content from an article URL.
 
@@ -51,7 +59,14 @@ class NewsFetcher:
 
             soup = BeautifulSoup(response.text, 'html.parser')
 
-            for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']):
+            for element in soup(['script',
+                                 'style',
+                                 'nav',
+                                 'header',
+                                 'footer',
+                                 'aside',
+                                 'form',
+                                 'button']):
                 element.decompose()
 
             content_selectors = [
@@ -84,7 +99,8 @@ class NewsFetcher:
                 if body:
                     article_text = body.get_text(separator=' ', strip=True)
 
-            article_text = re.sub(r'\s+', ' ', article_text)  # Normalize whitespace
+            article_text = re.sub(
+                r'\s+', ' ', article_text)  # Normalize whitespace
             article_text = article_text.strip()
 
             # Limit length to avoid overwhelming the LLM
@@ -97,14 +113,21 @@ class NewsFetcher:
             logger.warning(f"⏰ Timeout fetching article content from: {url}")
             return ""
         except httpx.HTTPError as e:
-            logger.warning(f"🌐 HTTP error fetching article content from {url}: {e}")
+            logger.warning(
+                f"🌐 HTTP error fetching article content from {url}: {e}")
             return ""
         except Exception as e:
-            logger.warning(f"❌ Error fetching article content from {url}: {type(e).__name__}: {e}")
+            logger.warning(
+                f"❌ Error fetching article content from {url}: {
+                    type(e).__name__}: {e}")
             return ""
 
     @staticmethod
-    def build_prompt(url: str, title: str = "", description: str = "", content: str = "") -> str:
+    def build_prompt(
+            url: str,
+            title: str = "",
+            description: str = "",
+            content: str = "") -> str:
         """
         Generate a prompt for the LLM to summarize an article.
 
@@ -124,10 +147,12 @@ class NewsFetcher:
         if description:
             context_info.append(f"RSS-Beschreibung: {description}")
         if content:
-            content_preview = content[:500] + "..." if len(content) > 500 else content
+            content_preview = content[:500] + \
+                "..." if len(content) > 500 else content
             context_info.append(f"Artikel-Inhalt: {content_preview}")
 
-        context = "\n".join(context_info) if context_info else "Keine zusätzlichen Informationen verfügbar."
+        context = "\n".join(
+            context_info) if context_info else "Keine zusätzlichen Informationen verfügbar."
 
         return (
             "### Aufgabe\n"
@@ -171,9 +196,11 @@ class NewsFetcher:
         article_content = await NewsFetcher.fetch_article_content(client, url)
 
         if not article_content:
-            logger.warning(f"⚠️ Could not fetch article content, using RSS data only")
+            logger.warning(
+                f"⚠️ Could not fetch article content, using RSS data only")
 
-        prompt = NewsFetcher.build_prompt(url, title, description, article_content)
+        prompt = NewsFetcher.build_prompt(
+            url, title, description, article_content)
         payload = {
             "model": LLM_MODEL,
             "prompt": prompt,
@@ -200,7 +227,8 @@ class NewsFetcher:
 
             # Validate required fields
             required_fields = ["title", "description"]
-            missing_fields = [field for field in required_fields if field not in summary_data]
+            missing_fields = [
+                field for field in required_fields if field not in summary_data]
 
             if missing_fields:
                 logger.warning(
@@ -229,7 +257,9 @@ class NewsFetcher:
             logger.error(f"❌ HTTP error for {url}: {e}")
             return None
         except Exception as e:
-            logger.error(f"❌ Unexpected error summarizing {url}: {type(e).__name__}: {e}")
+            logger.error(
+                f"❌ Unexpected error summarizing {url}: {
+                    type(e).__name__}: {e}")
             return None
 
     @staticmethod
@@ -264,7 +294,9 @@ class NewsFetcher:
             )
 
         except Exception as e:
-            logger.error(f"❌ Critical error during harvest: {type(e).__name__}: {e}")
+            logger.error(
+                f"❌ Critical error during harvest: {
+                    type(e).__name__}: {e}")
             raise
 
     @staticmethod
@@ -289,14 +321,20 @@ class NewsFetcher:
             feed_data = feedparser.parse(feed_row["url"])
 
             if hasattr(feed_data, 'bozo') and feed_data.bozo:
-                logger.warning(f"⚠️ Feed has parsing issues: {feed_row['url']}")
+                logger.warning(
+                    f"⚠️ Feed has parsing issues: {
+                        feed_row['url']}")
                 if hasattr(feed_data, 'bozo_exception'):
-                    logger.warning(f"⚠️ Feed exception: {feed_data.bozo_exception}")
+                    logger.warning(
+                        f"⚠️ Feed exception: {
+                            feed_data.bozo_exception}")
 
             total_entries = len(feed_data.entries)
 
             if total_entries == 0:
-                logger.warning(f"⚠️ No entries found in feed: {feed_row['url']}")
+                logger.warning(
+                    f"⚠️ No entries found in feed: {
+                        feed_row['url']}")
                 return stats
 
             for i, entry in enumerate(feed_data.entries, 1):
@@ -321,18 +359,23 @@ class NewsFetcher:
                     stats['skipped'] += 1
                     continue
 
-                # Check if article already exists - use readonly connection for better concurrency
+                # Check if article already exists - use readonly connection for
+                # better concurrency
                 try:
                     with db_manager.get_cursor_with_retry(readonly=True) as cursor:
-                        cursor.execute("SELECT id FROM news WHERE url = ?", (article_url,))
+                        cursor.execute(
+                            "SELECT id FROM news WHERE url = ?", (article_url,))
                         if cursor.fetchone():
                             stats['skipped'] += 1
                             continue
                 except Exception as db_error:
-                    logger.warning(f"⚠️ Database check failed for article {i}, continuing: {db_error}")
+                    logger.warning(
+                        f"⚠️ Database check failed for article {i}, continuing: {db_error}")
 
                 rss_title = getattr(entry, 'title', '')
-                rss_description = getattr(entry, 'description', '') or getattr(entry, 'summary', '')
+                rss_description = getattr(
+                    entry, 'description', '') or getattr(
+                    entry, 'summary', '')
 
                 summary = await NewsFetcher.summarize_article(
                     client,
@@ -342,7 +385,8 @@ class NewsFetcher:
                 )
 
                 if not summary:
-                    logger.warning(f"❌ Failed to get summary for article {i}: {article_url}")
+                    logger.warning(
+                        f"❌ Failed to get summary for article {i}: {article_url}")
                     stats['failed'] += 1
                     continue
 
@@ -353,7 +397,7 @@ class NewsFetcher:
                         cursor.execute(
                             """
                             INSERT
-                            OR IGNORE INTO news
+                            OR IGNORE INTO news
                             (title, description, url, published, country)
                             VALUES (?, ?, ?, ?, ?)
                             """,
@@ -369,13 +413,17 @@ class NewsFetcher:
                     stats['successful'] += 1
 
                 except Exception as db_error:
-                    logger.error(f"❌ Database error for article {i}: {db_error}")
+                    logger.error(
+                        f"❌ Database error for article {i}: {db_error}")
                     stats['failed'] += 1
                     continue
 
                 await asyncio.sleep(0.01)  # 10ms delay to yield control
 
         except Exception as e:
-            logger.error(f"❌ Error processing feed {feed_row['url']}: {type(e).__name__}: {e}")
+            logger.error(
+                f"❌ Error processing feed {
+                    feed_row['url']}: {
+                    type(e).__name__}: {e}")
 
         return stats