enhance: add article content extraction and integrate it into the summarization process

commit 3a1c817381 (parent 003b8da4b2)
2025-08-01 18:55:55 +02:00
3 changed files with 131 additions and 22 deletions

backend/.gitignore

@@ -54,3 +54,5 @@ logs/
 .vscode/
 *.swp
 *.swo
+/owlynews.sqlite-shm
+/owlynews.sqlite-wal
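
The two new entries are SQLite's write-ahead-log side files: once the backend opens owlynews.sqlite in WAL mode, the -wal and -shm files appear next to the database and should not be committed. A minimal sketch of the mode that produces them (the actual connection setup is not part of this diff; the PRAGMA shown is an assumption about how the database is configured):

import sqlite3

# Opening the database in WAL mode creates owlynews.sqlite-wal and
# owlynews.sqlite-shm alongside the main file while connections are active.
conn = sqlite3.connect("owlynews.sqlite")
conn.execute("PRAGMA journal_mode=WAL")
conn.close()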

View File

@@ -1,3 +1,4 @@
 """
 Owly News Summariser Backend

@@ -12,6 +13,7 @@ import asyncio
 import json
 import os
 import sqlite3
+import re
 from contextlib import contextmanager
 from datetime import datetime, timezone, timedelta
 from http.client import HTTPException

@@ -25,6 +27,7 @@ from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from fastapi import FastAPI, Response, status, Depends
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
+from bs4 import BeautifulSoup

 # Constants
 DB_PATH = Path("owlynews.sqlite")

@@ -36,6 +39,8 @@ SYNC_COOLDOWN_MINUTES = 30
 LLM_MODEL = "qwen2:7b-instruct-q4_K_M"
 LLM_TIMEOUT_SECONDS = 180
 OLLAMA_API_TIMEOUT_SECONDS = 10
+ARTICLE_FETCH_TIMEOUT = 30
+MAX_ARTICLE_LENGTH = 5000  # Max characters from article content

 # Add logging configuration at the top of your file
 logging.basicConfig(
@@ -279,7 +284,97 @@ class NewsFetcher:
     """

     @staticmethod
-    def build_prompt(url: str, title: str = "", description: str = "") -> str:
+    async def fetch_article_content(client: httpx.AsyncClient, url: str) -> str:
+        """
+        Fetch and extract the main content from an article URL.
+
+        Args:
+            client: An active httpx AsyncClient for making requests
+            url: URL of the article to fetch
+
+        Returns:
+            Extracted text content from the article, or empty string if failed
+        """
+        try:
+            logger.debug(f"🌐 Fetching article content from: {url}")
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            }
+            response = await client.get(
+                url,
+                headers=headers,
+                timeout=ARTICLE_FETCH_TIMEOUT,
+                follow_redirects=True
+            )
+            response.raise_for_status()
+
+            # Parse HTML content
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Remove unwanted elements
+            for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']):
+                element.decompose()
+
+            # Try to find main content areas
+            content_selectors = [
+                'article',
+                '[role="main"]',
+                '.content',
+                '.article-content',
+                '.post-content',
+                '.entry-content',
+                '.main-content',
+                'main',
+                '.story-body',
+                '.article-body'
+            ]
+
+            article_text = ""
+            # Try each selector until we find content
+            for selector in content_selectors:
+                elements = soup.select(selector)
+                if elements:
+                    # Get text from all matching elements
+                    for element in elements:
+                        text = element.get_text(separator=' ', strip=True)
+                        if len(text) > len(article_text):
+                            article_text = text
+                    break
+
+            # Fallback: get text from body if no specific content area found
+            if not article_text:
+                body = soup.find('body')
+                if body:
+                    article_text = body.get_text(separator=' ', strip=True)
+
+            # Clean up the text
+            article_text = re.sub(r'\s+', ' ', article_text)  # Normalize whitespace
+            article_text = article_text.strip()
+
+            # Limit length to avoid overwhelming the LLM
+            if len(article_text) > MAX_ARTICLE_LENGTH:
+                article_text = article_text[:MAX_ARTICLE_LENGTH] + "..."
+                logger.debug(f"✂️ Truncated article content to {MAX_ARTICLE_LENGTH} characters")
+
+            logger.debug(f"📄 Extracted {len(article_text)} characters from article")
+            return article_text
+
+        except httpx.TimeoutException:
+            logger.warning(f"⏰ Timeout fetching article content from: {url}")
+            return ""
+        except httpx.HTTPError as e:
+            logger.warning(f"🌐 HTTP error fetching article content from {url}: {e}")
+            return ""
+        except Exception as e:
+            logger.warning(f"❌ Error fetching article content from {url}: {type(e).__name__}: {e}")
+            return ""
+
+    @staticmethod
+    def build_prompt(url: str, title: str = "", description: str = "", content: str = "") -> str:
         """
         Generate a prompt for the LLM to summarize an article.
@@ -287,6 +382,7 @@ class NewsFetcher:
             url: Public URL of the article to summarize
             title: Article title from RSS feed (optional)
             description: Article description from RSS feed (optional)
+            content: Extracted article content (optional)

         Returns:
             A formatted prompt string that instructs the LLM to generate
@@ -294,9 +390,13 @@ class NewsFetcher:
         """
         context_info = []
         if title:
-            context_info.append(f"Titel: {title}")
+            context_info.append(f"RSS-Titel: {title}")
         if description:
-            context_info.append(f"Beschreibung: {description}")
+            context_info.append(f"RSS-Beschreibung: {description}")
+        if content:
+            # Show first part of content for context
+            content_preview = content[:500] + "..." if len(content) > 500 else content
+            context_info.append(f"Artikel-Inhalt: {content_preview}")

         context = "\n".join(context_info) if context_info else "Keine zusätzlichen Informationen verfügbar."
@@ -306,14 +406,15 @@ class NewsFetcher:
            f"URL: {url}\n"
            f"Verfügbare Informationen:\n{context}\n\n"
            "### Regeln\n"
-            "1. Nutze die verfügbaren Informationen (Titel, Beschreibung) und dein Wissen über die URL-Domain\n"
-            "2. Falls keine ausreichenden Informationen vorliegen, erstelle eine plausible Zusammenfassung basierend auf der URL\n"
-            "3. Gib ausschließlich **gültiges minifiziertes JSON** zurück kein Markdown, keine Kommentare\n"
-            "4. Struktur: {\"title\":\"\",\"summary_de\":\"\",\"summary_en\":\"\"}\n"
-            "5. title: Aussagekräftiger deutscher Titel (max 100 Zeichen)\n"
-            "6. summary_de: Deutsche Zusammenfassung (max 160 Wörter)\n"
-            "7. summary_en: Englische Zusammenfassung (max 160 Wörter)\n"
-            "8. Kein Text vor oder nach dem JSON\n\n"
+            "1. Nutze VORRANGIG den Artikel-Inhalt falls verfügbar, ergänze mit RSS-Informationen\n"
+            "2. Falls kein Artikel-Inhalt verfügbar ist, nutze RSS-Titel und -Beschreibung\n"
+            "3. Falls keine ausreichenden Informationen vorliegen, erstelle eine plausible Zusammenfassung basierend auf der URL\n"
+            "4. Gib ausschließlich **gültiges minifiziertes JSON** zurück kein Markdown, keine Kommentare\n"
+            "5. Struktur: {\"title\":\"\",\"summary_de\":\"\",\"summary_en\":\"\"}\n"
+            "6. title: Aussagekräftiger deutscher Titel (max 100 Zeichen)\n"
+            "7. summary_de: Deutsche Zusammenfassung (max 160 Wörter)\n"
+            "8. summary_en: Englische Zusammenfassung (max 160 Wörter)\n"
+            "9. Kein Text vor oder nach dem JSON\n\n"
            "### Ausgabe\n"
            "Jetzt antworte mit dem JSON:"
        )
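
For context, the rules above ask the model for a single line of minified JSON with exactly these three keys; a valid response would look like the following (values are illustrative, not taken from a real article):

{"title":"Beispieltitel","summary_de":"Kurze deutsche Zusammenfassung des Artikels.","summary_en":"Short English summary of the article."}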
@@ -327,6 +428,7 @@ class NewsFetcher:
    ) -> Optional[ArticleSummary]:
        """
        Generate a summary of an article using the LLM.
+        Now fetches the actual article content for more accurate summaries.

        Args:
            client: An active httpx AsyncClient for making requests
@@ -342,7 +444,16 @@ class NewsFetcher:
        logger.debug(f"📝 RSS Title: {title[:50]}..." if title else "📝 No RSS title")
        logger.debug(f"📄 RSS Description: {description[:100]}..." if description else "📄 No RSS description")

-        prompt = NewsFetcher.build_prompt(url, title, description)
+        # Fetch article content
+        logger.debug(f"🌐 Fetching article content...")
+        article_content = await NewsFetcher.fetch_article_content(client, url)
+
+        if article_content:
+            logger.info(f"✅ Successfully fetched article content ({len(article_content)} chars)")
+        else:
+            logger.warning(f"⚠️ Could not fetch article content, using RSS data only")
+
+        prompt = NewsFetcher.build_prompt(url, title, description, article_content)

        payload = {
            "model": LLM_MODEL,
            "prompt": prompt,
@@ -472,6 +583,7 @@ class NewsFetcher:
    ) -> Dict[str, int]:
        """
        Process a single feed, fetching and summarizing all articles.
+        Now saves summaries immediately to the database.

        Args:
            client: An active httpx AsyncClient for making requests
@@ -509,7 +621,7 @@ class NewsFetcher:
                continue

            if not hasattr(entry, "published_parsed"):
-                logger.debug(f"⏩ Skipping entry {i}: no published date")  # TODO: change back to 0.5
+                logger.debug(f"⏩ Skipping entry {i}: no published date")
                stats['skipped'] += 1
                continue
@@ -557,15 +669,9 @@ class NewsFetcher:
            published_timestamp = int(published.timestamp())

-            # Handle source field - it can be a string or dict
-            source_value = entry.get("source", feed_row["url"])
-            if isinstance(source_value, dict):
-                source_title = source_value.get("title", feed_row["url"])
-            else:
-                source_title = source_value if source_value else feed_row["url"]
-
-            logger.debug(f"💾 Storing article in database")
-            # Store in database immediately after successful summarization
+            logger.debug(f"💾 Storing article in database immediately after summarization")

            # Store in database
            try:
                with db_manager.get_cursor() as cursor:
@@ -583,7 +689,7 @@ class NewsFetcher:
                        )
                    )

-                logger.info(f"✅ Successfully processed article {i}: {summary['title'][:50]}...")
+                logger.info(f"✅ Successfully processed and stored article {i}: {summary['title'][:50]}...")
                stats['successful'] += 1

            except Exception as db_error:
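
Taken together, the new flow is: fetch_article_content downloads and strips the page, build_prompt folds the extracted text into the prompt context, and the summary request is sent to the LLM as before. A minimal sketch for exercising the new helper in isolation (the import path and example URL are illustrative; the backend module's actual name is not shown in this diff):

import asyncio
import httpx

# Assumption: the module shown above is importable as "main".
from main import NewsFetcher


async def check_extraction(url: str) -> None:
    # Fetch and clean the article, then build the prompt the LLM would receive.
    async with httpx.AsyncClient() as client:
        content = await NewsFetcher.fetch_article_content(client, url)
        prompt = NewsFetcher.build_prompt(url, title="", description="", content=content)
    print(f"extracted {len(content)} characters")
    print(prompt[:300])


if __name__ == "__main__":
    asyncio.run(check_extraction("https://example.com/some-article"))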

View File

@@ -8,3 +8,4 @@ uvicorn[standard]
 python-multipart
 psycopg2-binary
 sqlalchemy
+beautifulsoup4
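
For reference, beautifulsoup4 is the PyPI distribution that provides the bs4 package imported in the backend above, so reinstalling the project's requirements after pulling this commit picks up the new dependency.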