refactor: apply consistent formatting and improve code readability across backend modules
@@ -1,6 +1,6 @@
-from pathlib import Path
-import os
 import logging
+import os
+from pathlib import Path
 
 DB_PATH = Path(os.getenv("DB_NAME", "owlynews.sqlite3"))
 OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
@@ -42,7 +42,8 @@ def update_constants_from_db(settings_dict):
     if 'ollama_host' in settings_dict and os.getenv("OLLAMA_HOST") is None:
         OLLAMA_HOST = settings_dict['ollama_host']
 
-    if 'min_cron_hours' in settings_dict and os.getenv("MIN_CRON_HOURS") is None:
+    if 'min_cron_hours' in settings_dict and os.getenv(
+            "MIN_CRON_HOURS") is None:
         try:
             MIN_CRON_HOURS = float(settings_dict['min_cron_hours'])
         except (ValueError, TypeError):
@@ -61,7 +62,8 @@ def update_constants_from_db(settings_dict):
                 f"{settings_dict['cron_hours']}"
             )
 
-    if 'sync_cooldown_minutes' in settings_dict and os.getenv("SYNC_COOLDOWN_MINUTES") is None:
+    if 'sync_cooldown_minutes' in settings_dict and os.getenv(
+            "SYNC_COOLDOWN_MINUTES") is None:
         try:
             SYNC_COOLDOWN_MINUTES = int(settings_dict['sync_cooldown_minutes'])
         except (ValueError, TypeError):
@@ -73,7 +75,8 @@ def update_constants_from_db(settings_dict):
     if 'llm_model' in settings_dict and os.getenv("LLM_MODEL") is None:
         LLM_MODEL = settings_dict['llm_model']
 
-    if 'llm_timeout_seconds' in settings_dict and os.getenv("LLM_TIMEOUT_SECONDS") is None:
+    if 'llm_timeout_seconds' in settings_dict and os.getenv(
+            "LLM_TIMEOUT_SECONDS") is None:
         try:
             LLM_TIMEOUT_SECONDS = int(settings_dict['llm_timeout_seconds'])
         except (ValueError, TypeError):
@@ -82,16 +85,19 @@ def update_constants_from_db(settings_dict):
                 f"{settings_dict['llm_timeout_seconds']}"
             )
 
-    if 'ollama_api_timeout_seconds' in settings_dict and os.getenv("OLLAMA_API_TIMEOUT_SECONDS") is None:
+    if 'ollama_api_timeout_seconds' in settings_dict and os.getenv(
+            "OLLAMA_API_TIMEOUT_SECONDS") is None:
         try:
-            OLLAMA_API_TIMEOUT_SECONDS = int(settings_dict['ollama_api_timeout_seconds'])
+            OLLAMA_API_TIMEOUT_SECONDS = int(
+                settings_dict['ollama_api_timeout_seconds'])
         except (ValueError, TypeError):
             logger.warning(
                 f"⚠️ Invalid ollama_api_timeout_seconds value in DB: "
                 f"{settings_dict['ollama_api_timeout_seconds']}"
             )
 
-    if 'article_fetch_timeout' in settings_dict and os.getenv("ARTICLE_FETCH_TIMEOUT") is None:
+    if 'article_fetch_timeout' in settings_dict and os.getenv(
+            "ARTICLE_FETCH_TIMEOUT") is None:
         try:
             ARTICLE_FETCH_TIMEOUT = int(settings_dict['article_fetch_timeout'])
         except (ValueError, TypeError):
@@ -100,7 +106,8 @@ def update_constants_from_db(settings_dict):
                 f"{settings_dict['article_fetch_timeout']}"
             )
 
-    if 'max_article_length' in settings_dict and os.getenv("MAX_ARTICLE_LENGTH") is None:
+    if 'max_article_length' in settings_dict and os.getenv(
+            "MAX_ARTICLE_LENGTH") is None:
         try:
             MAX_ARTICLE_LENGTH = int(settings_dict['max_article_length'])
         except (ValueError, TypeError):
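The six blocks above repeat one pattern: use the value from settings_dict only when the matching environment variable is unset, coerce it, and log a warning when coercion fails. A minimal sketch of a helper that would collapse that repetition; the name _setting_override and its signature are assumptions, not part of this commit:

def _setting_override(settings_dict, key, env_var, cast, default):
    # Environment variables win over DB-stored settings, mirroring the checks above.
    if key not in settings_dict or os.getenv(env_var) is not None:
        return default
    try:
        return cast(settings_dict[key])
    except (ValueError, TypeError):
        logger.warning(f"⚠️ Invalid {key} value in DB: {settings_dict[key]}")
        return default

Hypothetical usage inside update_constants_from_db:

MIN_CRON_HOURS = _setting_override(settings_dict, 'min_cron_hours', 'MIN_CRON_HOURS', float, MIN_CRON_HOURS)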
@@ -1,11 +1,24 @@
+import json
+import sqlite3
+import time
 from contextlib import contextmanager
 from pathlib import Path
-import sqlite3
 from typing import Iterator
 
-from backend.app.config import logger, DB_PATH, update_constants_from_db, OLLAMA_HOST, CRON_HOURS, MIN_CRON_HOURS, \
-    SYNC_COOLDOWN_MINUTES, LLM_MODEL, LLM_TIMEOUT_SECONDS, OLLAMA_API_TIMEOUT_SECONDS, ARTICLE_FETCH_TIMEOUT, \
-    MAX_ARTICLE_LENGTH
+from backend.app.config import (
+    ARTICLE_FETCH_TIMEOUT,
+    CRON_HOURS,
+    DB_PATH,
+    LLM_MODEL,
+    LLM_TIMEOUT_SECONDS,
+    MAX_ARTICLE_LENGTH,
+    MIN_CRON_HOURS,
+    OLLAMA_API_TIMEOUT_SECONDS,
+    OLLAMA_HOST,
+    SYNC_COOLDOWN_MINUTES,
+    logger,
+    update_constants_from_db,
+)
 
 
 class DatabaseManager:
@@ -41,7 +54,8 @@ class DatabaseManager:
             schema_sql = f.read()
 
         with self.get_cursor() as cursor:
-            statements = [stmt.strip() for stmt in schema_sql.split(';') if stmt.strip()]
+            statements = [stmt.strip()
+                          for stmt in schema_sql.split(';') if stmt.strip()]
             for statement in statements:
                 cursor.execute(statement)
 
@@ -85,14 +99,12 @@ class DatabaseManager:
         """
         Seed initial feeds from seed_feeds.json file.
         """
-        import json
-        from pathlib import Path
-
         try:
             seed_file = Path(__file__).parent / "seed_feeds.json"
 
             if not seed_file.exists():
-                logger.warning("⚠️ seed_feeds.json not found, skipping feed seeding")
+                logger.warning(
+                    "⚠️ seed_feeds.json not found, skipping feed seeding")
                 return
 
             with open(seed_file, 'r', encoding='utf-8') as f:
@@ -101,10 +113,7 @@ class DatabaseManager:
                 for country, urls in feeds_data.items():
                     for url in urls:
                         cursor.execute(
-                            "INSERT OR IGNORE INTO feeds (country, url) VALUES (?, ?)",
-                            (country, url)
-                        )
-
+                            "INSERT OR IGNORE INTO feeds (country, url) VALUES (?, ?)", (country, url))
 
         except Exception as e:
             logger.error(f"❌ Failed to seed feeds: {e}")
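The seeding loop iterates feeds_data.items() as country → list of URLs, so seed_feeds.json is presumably a mapping of country code to feed URLs. A hypothetical example of the structure json.load would return (the codes and URLs are placeholders, not taken from the repository):

feeds_data = {
    "de": ["https://example.com/rss/de.xml"],
    "us": ["https://example.com/rss/us.xml", "https://example.org/feed"],
}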
@@ -182,8 +191,7 @@ class DatabaseManager:
                 conn.rollback()
             if "database is locked" in str(e).lower():
                 logger.warning(
-                    f"⚠️ Database temporarily locked, operation may need retry: {e}"
-                )
+                    f"⚠️ Database temporarily locked, operation may need retry: {e}")
             raise e
         except Exception as e:
             if conn:
@@ -194,7 +202,9 @@ class DatabaseManager:
                 conn.close()
 
     @contextmanager
-    def get_cursor_with_retry(self, readonly: bool = False, max_retries: int = 3) -> Iterator[sqlite3.Cursor]:
+    def get_cursor_with_retry(self,
+                              readonly: bool = False,
+                              max_retries: int = 3) -> Iterator[sqlite3.Cursor]:
         """
         Context manager with retry logic for database operations.
 
@@ -211,13 +221,13 @@ class DatabaseManager:
                     yield cursor
                     return
             except sqlite3.OperationalError as e:
-                if "database is locked" in str(e).lower() and attempt < max_retries:
+                if "database is locked" in str(
+                        e).lower() and attempt < max_retries:
                     wait_time = (attempt + 1) * 0.1
                     logger.warning(
                         f"⚠️ Database locked, retrying in {wait_time}s "
                         f"(attempt {attempt + 1}/{max_retries + 1})"
                     )
-                    import time
                     time.sleep(wait_time)
                     continue
                 raise e
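get_cursor_with_retry wraps the cursor context manager in up to max_retries extra attempts with a linear backoff of (attempt + 1) * 0.1 seconds whenever SQLite reports "database is locked". A short usage sketch, consistent with the read path later in this commit; the query itself is illustrative only:

# Retry-aware read: with max_retries=3 the waits are 0.1s, 0.2s and 0.3s before giving up.
with db_manager.get_cursor_with_retry(readonly=True, max_retries=3) as cursor:
    cursor.execute("SELECT COUNT(*) FROM news")
    row_count = cursor.fetchone()[0]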
@@ -12,22 +12,37 @@ The application uses SQLite for data storage and APScheduler for scheduling peri
 import asyncio
 import os
 import sqlite3
+import time
 from datetime import datetime, timedelta, timezone
-from http.client import HTTPException
 from typing import Any, Dict, List, Union
 
 # Third-party imports
 import httpx
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from apscheduler.triggers.interval import IntervalTrigger
-from fastapi import Depends, FastAPI, Response, status
+from fastapi import Depends, FastAPI, HTTPException, Response, status
 from fastapi.staticfiles import StaticFiles
 
-from backend.app.config import logger, OLLAMA_HOST, CRON_HOURS, MIN_CRON_HOURS, \
-    SYNC_COOLDOWN_MINUTES, LLM_MODEL, OLLAMA_API_TIMEOUT_SECONDS, frontend_path
+from backend.app.config import (
+    CRON_HOURS,
+    LLM_MODEL,
+    MIN_CRON_HOURS,
+    OLLAMA_API_TIMEOUT_SECONDS,
+    OLLAMA_HOST,
+    SYNC_COOLDOWN_MINUTES,
+    frontend_path,
+    logger,
+)
 from backend.app.database import get_db, get_db_write
-from backend.app.models import TimestampResponse, SuccessResponse, FeedData, ModelStatus, ErrorResponse, HoursResponse, \
-    CronSettings
+from backend.app.models import (
+    CronSettings,
+    ErrorResponse,
+    FeedData,
+    HoursResponse,
+    ModelStatus,
+    SuccessResponse,
+    TimestampResponse,
+)
 from backend.app.services import NewsFetcher
 
 app = FastAPI(
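Beyond import sorting, this hunk replaces the HTTPException import from http.client with FastAPI's own HTTPException. http.client.HTTPException is not part of FastAPI's error handling, so raising it in a route surfaces as an unhandled error (a 500), while fastapi.HTTPException is converted into the intended status code and JSON body. A self-contained sketch of that behaviour, kept separate from the application's own app object and routes:

from fastapi import FastAPI, HTTPException

demo_app = FastAPI()

@demo_app.get("/demo")
async def demo(ok: bool = False):
    if not ok:
        # FastAPI turns this into a proper 400 response with a JSON detail field.
        raise HTTPException(status_code=400, detail="Invalid date format. Use ISO format (YYYY-MM-DD)")
    return {"status": "ok"}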
@@ -88,7 +103,8 @@ async def get_news(
         return [dict(row) for row in db.fetchall()]
 
     except ValueError:
-        raise HTTPException(400, "Invalid date format. Use ISO format (YYYY-MM-DD)")
+        raise HTTPException(
+            400, "Invalid date format. Use ISO format (YYYY-MM-DD)")
     except Exception as e:
         logger.error(f"❌ Error fetching news: {e}")
         raise HTTPException(
@@ -244,8 +260,7 @@ async def manual_sync(db: sqlite3.Cursor = Depends(get_db)):
     if now - last_sync_time < timedelta(minutes=SYNC_COOLDOWN_MINUTES):
         return Response(
             status_code=status.HTTP_429_TOO_MANY_REQUESTS,
-            content="Sync was triggered too recently. Please wait before triggering again."
-        )
+            content="Sync was triggered too recently. Please wait before triggering again.")
 
     try:
         task = asyncio.create_task(NewsFetcher.harvest_feeds())
@@ -281,7 +296,9 @@ async def get_cron_schedule(db: sqlite3.Cursor = Depends(get_db)):
 
 
 @app.post("/settings/cron", response_model=HoursResponse)
-async def update_cron_schedule(data: CronSettings, db: sqlite3.Cursor = Depends(get_db_write)):
+async def update_cron_schedule(
+        data: CronSettings,
+        db: sqlite3.Cursor = Depends(get_db_write)):
     """
     Update the cron schedule for harvesting news.
 
@@ -1,4 +1,4 @@
-from typing import TypedDict, List
+from typing import List, TypedDict
 
 from pydantic import BaseModel
 
@@ -3,14 +3,20 @@ import json
 import re
 import sqlite3
 from datetime import datetime, timezone
-from typing import Optional, cast, Dict
+from typing import Dict, Optional, cast
 
 import feedparser
 import httpx
 from bs4 import BeautifulSoup
 
-from backend.app.config import ARTICLE_FETCH_TIMEOUT, MAX_ARTICLE_LENGTH, logger, LLM_MODEL, OLLAMA_HOST, \
-    LLM_TIMEOUT_SECONDS
+from backend.app.config import (
+    ARTICLE_FETCH_TIMEOUT,
+    LLM_MODEL,
+    LLM_TIMEOUT_SECONDS,
+    MAX_ARTICLE_LENGTH,
+    OLLAMA_HOST,
+    logger,
+)
 from backend.app.database import db_manager
 from backend.app.models import ArticleSummary
 
@@ -22,7 +28,9 @@ class NewsFetcher:
     """
 
     @staticmethod
-    async def fetch_article_content(client: httpx.AsyncClient, url: str) -> str:
+    async def fetch_article_content(
+            client: httpx.AsyncClient,
+            url: str) -> str:
         """
         Fetch and extract the main content from an article URL.
 
@@ -51,7 +59,14 @@ class NewsFetcher:
 
             soup = BeautifulSoup(response.text, 'html.parser')
 
-            for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']):
+            for element in soup(['script',
+                                 'style',
+                                 'nav',
+                                 'header',
+                                 'footer',
+                                 'aside',
+                                 'form',
+                                 'button']):
                 element.decompose()
 
             content_selectors = [
@@ -84,7 +99,8 @@ class NewsFetcher:
             if body:
                 article_text = body.get_text(separator=' ', strip=True)
 
-            article_text = re.sub(r'\s+', ' ', article_text) # Normalize whitespace
+            article_text = re.sub(
+                r'\s+', ' ', article_text) # Normalize whitespace
             article_text = article_text.strip()
 
             # Limit length to avoid overwhelming the LLM
@@ -97,14 +113,21 @@ class NewsFetcher:
             logger.warning(f"⏰ Timeout fetching article content from: {url}")
             return ""
         except httpx.HTTPError as e:
-            logger.warning(f"🌐 HTTP error fetching article content from {url}: {e}")
+            logger.warning(
+                f"🌐 HTTP error fetching article content from {url}: {e}")
             return ""
         except Exception as e:
-            logger.warning(f"❌ Error fetching article content from {url}: {type(e).__name__}: {e}")
+            logger.warning(
+                f"❌ Error fetching article content from {url}: {
+                    type(e).__name__}: {e}")
             return ""
 
     @staticmethod
-    def build_prompt(url: str, title: str = "", description: str = "", content: str = "") -> str:
+    def build_prompt(
+            url: str,
+            title: str = "",
+            description: str = "",
+            content: str = "") -> str:
         """
         Generate a prompt for the LLM to summarize an article.
 
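Several of the rewrapped log calls in this file (here and in the hunks below) break an f-string replacement field across lines, for example f"... {url}: {" followed by "type(e).__name__}: {e}") on the next line. That form is only valid on Python 3.12+ (PEP 701); older interpreters reject it as a SyntaxError. If earlier versions must be supported, an equivalent wrap keeps each replacement field on one line and relies on implicit string concatenation — a sketch, not what the commit does:

logger.warning(
    f"❌ Error fetching article content from {url}: "
    f"{type(e).__name__}: {e}")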
@@ -124,10 +147,12 @@ class NewsFetcher:
         if description:
             context_info.append(f"RSS-Beschreibung: {description}")
         if content:
-            content_preview = content[:500] + "..." if len(content) > 500 else content
+            content_preview = content[:500] + \
+                "..." if len(content) > 500 else content
             context_info.append(f"Artikel-Inhalt: {content_preview}")
 
-        context = "\n".join(context_info) if context_info else "Keine zusätzlichen Informationen verfügbar."
+        context = "\n".join(
+            context_info) if context_info else "Keine zusätzlichen Informationen verfügbar."
 
         return (
             "### Aufgabe\n"
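The backslash continuation of the conditional expression above is easy to misread; wrapping the whole expression in parentheses keeps the same behaviour (truncate to 500 characters and append an ellipsis) without a line continuation. An alternative formatting sketch, not what the commit applies:

content_preview = (
    content[:500] + "..." if len(content) > 500 else content
)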
@@ -171,9 +196,11 @@ class NewsFetcher:
             article_content = await NewsFetcher.fetch_article_content(client, url)
 
             if not article_content:
-                logger.warning(f"⚠️ Could not fetch article content, using RSS data only")
+                logger.warning(
+                    f"⚠️ Could not fetch article content, using RSS data only")
 
-            prompt = NewsFetcher.build_prompt(url, title, description, article_content)
+            prompt = NewsFetcher.build_prompt(
+                url, title, description, article_content)
             payload = {
                 "model": LLM_MODEL,
                 "prompt": prompt,
@@ -200,7 +227,8 @@ class NewsFetcher:
 
             # Validate required fields
             required_fields = ["title", "description"]
-            missing_fields = [field for field in required_fields if field not in summary_data]
+            missing_fields = [
+                field for field in required_fields if field not in summary_data]
 
             if missing_fields:
                 logger.warning(
@@ -229,7 +257,9 @@ class NewsFetcher:
             logger.error(f"❌ HTTP error for {url}: {e}")
             return None
         except Exception as e:
-            logger.error(f"❌ Unexpected error summarizing {url}: {type(e).__name__}: {e}")
+            logger.error(
+                f"❌ Unexpected error summarizing {url}: {
+                    type(e).__name__}: {e}")
             return None
 
     @staticmethod
@@ -264,7 +294,9 @@ class NewsFetcher:
             )
 
         except Exception as e:
-            logger.error(f"❌ Critical error during harvest: {type(e).__name__}: {e}")
+            logger.error(
+                f"❌ Critical error during harvest: {
+                    type(e).__name__}: {e}")
             raise
 
     @staticmethod
@@ -289,14 +321,20 @@ class NewsFetcher:
                 feed_data = feedparser.parse(feed_row["url"])
 
                 if hasattr(feed_data, 'bozo') and feed_data.bozo:
-                    logger.warning(f"⚠️ Feed has parsing issues: {feed_row['url']}")
+                    logger.warning(
+                        f"⚠️ Feed has parsing issues: {
+                            feed_row['url']}")
                     if hasattr(feed_data, 'bozo_exception'):
-                        logger.warning(f"⚠️ Feed exception: {feed_data.bozo_exception}")
+                        logger.warning(
+                            f"⚠️ Feed exception: {
+                                feed_data.bozo_exception}")
 
                 total_entries = len(feed_data.entries)
 
                 if total_entries == 0:
-                    logger.warning(f"⚠️ No entries found in feed: {feed_row['url']}")
+                    logger.warning(
+                        f"⚠️ No entries found in feed: {
+                            feed_row['url']}")
                     return stats
 
                 for i, entry in enumerate(feed_data.entries, 1):
@@ -321,18 +359,23 @@ class NewsFetcher:
                         stats['skipped'] += 1
                         continue
 
-                    # Check if article already exists - use readonly connection for better concurrency
+                    # Check if article already exists - use readonly connection for
+                    # better concurrency
                     try:
                         with db_manager.get_cursor_with_retry(readonly=True) as cursor:
-                            cursor.execute("SELECT id FROM news WHERE url = ?", (article_url,))
+                            cursor.execute(
+                                "SELECT id FROM news WHERE url = ?", (article_url,))
                             if cursor.fetchone():
                                 stats['skipped'] += 1
                                 continue
                     except Exception as db_error:
-                        logger.warning(f"⚠️ Database check failed for article {i}, continuing: {db_error}")
+                        logger.warning(
+                            f"⚠️ Database check failed for article {i}, continuing: {db_error}")
 
                     rss_title = getattr(entry, 'title', '')
-                    rss_description = getattr(entry, 'description', '') or getattr(entry, 'summary', '')
+                    rss_description = getattr(
+                        entry, 'description', '') or getattr(
+                        entry, 'summary', '')
 
                     summary = await NewsFetcher.summarize_article(
                         client,
@@ -342,7 +385,8 @@ class NewsFetcher:
                     )
 
                     if not summary:
-                        logger.warning(f"❌ Failed to get summary for article {i}: {article_url}")
+                        logger.warning(
+                            f"❌ Failed to get summary for article {i}: {article_url}")
                         stats['failed'] += 1
                         continue
 
@@ -353,7 +397,7 @@ class NewsFetcher:
                         cursor.execute(
                             """
                             INSERT
                             OR IGNORE INTO news
                             (title, description, url, published, country)
                             VALUES (?, ?, ?, ?, ?)
                             """,
@@ -369,13 +413,17 @@ class NewsFetcher:
                         stats['successful'] += 1
 
                     except Exception as db_error:
-                        logger.error(f"❌ Database error for article {i}: {db_error}")
+                        logger.error(
+                            f"❌ Database error for article {i}: {db_error}")
                         stats['failed'] += 1
                         continue
 
                     await asyncio.sleep(0.01) # 10ms delay to yield control
 
             except Exception as e:
-                logger.error(f"❌ Error processing feed {feed_row['url']}: {type(e).__name__}: {e}")
+                logger.error(
+                    f"❌ Error processing feed {
+                        feed_row['url']}: {
+                        type(e).__name__}: {e}")
 
         return stats