enhance: add article content extraction and integrate with summarization process
backend/.gitignore (vendored) | 2 ++
@@ -54,3 +54,5 @@ logs/
 .vscode/
 *.swp
 *.swo
+/owlynews.sqlite-shm
+/owlynews.sqlite-wal
@@ -1,3 +1,4 @@
 """
 Owly News Summariser Backend
@@ -12,6 +13,7 @@ import asyncio
 import json
 import os
 import sqlite3
+import re
 from contextlib import contextmanager
 from datetime import datetime, timezone, timedelta
 from http.client import HTTPException
@@ -25,6 +27,7 @@ from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from fastapi import FastAPI, Response, status, Depends
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
+from bs4 import BeautifulSoup

 # Constants
 DB_PATH = Path("owlynews.sqlite")
@@ -36,6 +39,8 @@ SYNC_COOLDOWN_MINUTES = 30
 LLM_MODEL = "qwen2:7b-instruct-q4_K_M"
 LLM_TIMEOUT_SECONDS = 180
 OLLAMA_API_TIMEOUT_SECONDS = 10
+ARTICLE_FETCH_TIMEOUT = 30
+MAX_ARTICLE_LENGTH = 5000  # Max characters from article content

 # Add logging configuration at the top of your file
 logging.basicConfig(
@@ -279,7 +284,97 @@ class NewsFetcher:
     """

     @staticmethod
-    def build_prompt(url: str, title: str = "", description: str = "") -> str:
+    async def fetch_article_content(client: httpx.AsyncClient, url: str) -> str:
+        """
+        Fetch and extract the main content from an article URL.
+
+        Args:
+            client: An active httpx AsyncClient for making requests
+            url: URL of the article to fetch
+
+        Returns:
+            Extracted text content from the article, or empty string if failed
+        """
+        try:
+            logger.debug(f"🌐 Fetching article content from: {url}")
+
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            }
+
+            response = await client.get(
+                url,
+                headers=headers,
+                timeout=ARTICLE_FETCH_TIMEOUT,
+                follow_redirects=True
+            )
+            response.raise_for_status()
+
+            # Parse HTML content
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Remove unwanted elements
+            for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']):
+                element.decompose()
+
+            # Try to find main content areas
+            content_selectors = [
+                'article',
+                '[role="main"]',
+                '.content',
+                '.article-content',
+                '.post-content',
+                '.entry-content',
+                '.main-content',
+                'main',
+                '.story-body',
+                '.article-body'
+            ]
+
+            article_text = ""
+
+            # Try each selector until we find content
+            for selector in content_selectors:
+                elements = soup.select(selector)
+                if elements:
+                    # Get text from all matching elements
+                    for element in elements:
+                        text = element.get_text(separator=' ', strip=True)
+                        if len(text) > len(article_text):
+                            article_text = text
+                    break
+
+            # Fallback: get text from body if no specific content area found
+            if not article_text:
+                body = soup.find('body')
+                if body:
+                    article_text = body.get_text(separator=' ', strip=True)
+
+            # Clean up the text
+            article_text = re.sub(r'\s+', ' ', article_text)  # Normalize whitespace
+            article_text = article_text.strip()
+
+            # Limit length to avoid overwhelming the LLM
+            if len(article_text) > MAX_ARTICLE_LENGTH:
+                article_text = article_text[:MAX_ARTICLE_LENGTH] + "..."
+                logger.debug(f"✂️ Truncated article content to {MAX_ARTICLE_LENGTH} characters")
+
+            logger.debug(f"📄 Extracted {len(article_text)} characters from article")
+            return article_text
+
+        except httpx.TimeoutException:
+            logger.warning(f"⏰ Timeout fetching article content from: {url}")
+            return ""
+        except httpx.HTTPError as e:
+            logger.warning(f"🌐 HTTP error fetching article content from {url}: {e}")
+            return ""
+        except Exception as e:
+            logger.warning(f"❌ Error fetching article content from {url}: {type(e).__name__}: {e}")
+            return ""
+
+    @staticmethod
+    def build_prompt(url: str, title: str = "", description: str = "", content: str = "") -> str:
         """
         Generate a prompt for the LLM to summarize an article.
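Aside (not part of the commit): a minimal, self-contained sketch of the same extraction strategy (selector fallback, then the whole body, then whitespace normalisation and truncation), useful for trying the approach in isolation. The names extract and MAX_LEN are placeholders, not identifiers from this repository.

# Illustrative sketch only; mirrors the fallback logic added above.
import asyncio
import re

import httpx
from bs4 import BeautifulSoup

MAX_LEN = 5000  # stands in for MAX_ARTICLE_LENGTH

async def extract(url: str) -> str:
    async with httpx.AsyncClient(follow_redirects=True, timeout=30) as client:
        html = (await client.get(url)).text
    soup = BeautifulSoup(html, "html.parser")
    # Drop obvious boilerplate elements before extracting text
    for tag in soup(["script", "style", "nav", "header", "footer", "aside"]):
        tag.decompose()
    # Prefer a dedicated article container, fall back to the whole body
    node = soup.select_one("article, [role='main'], main") or soup.body
    text = node.get_text(" ", strip=True) if node else ""
    return re.sub(r"\s+", " ", text).strip()[:MAX_LEN]

if __name__ == "__main__":
    print(asyncio.run(extract("https://example.com"))[:200])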
@@ -287,6 +382,7 @@ class NewsFetcher:
             url: Public URL of the article to summarize
             title: Article title from RSS feed (optional)
             description: Article description from RSS feed (optional)
+            content: Extracted article content (optional)

         Returns:
             A formatted prompt string that instructs the LLM to generate
@@ -294,9 +390,13 @@ class NewsFetcher:
         """
         context_info = []
         if title:
-            context_info.append(f"Titel: {title}")
+            context_info.append(f"RSS-Titel: {title}")
         if description:
-            context_info.append(f"Beschreibung: {description}")
+            context_info.append(f"RSS-Beschreibung: {description}")
+        if content:
+            # Show first part of content for context
+            content_preview = content[:500] + "..." if len(content) > 500 else content
+            context_info.append(f"Artikel-Inhalt: {content_preview}")

         context = "\n".join(context_info) if context_info else "Keine zusätzlichen Informationen verfügbar."
@@ -306,14 +406,15 @@ class NewsFetcher:
             f"URL: {url}\n"
             f"Verfügbare Informationen:\n{context}\n\n"
             "### Regeln\n"
-            "1. Nutze die verfügbaren Informationen (Titel, Beschreibung) und dein Wissen über die URL-Domain\n"
-            "2. Falls keine ausreichenden Informationen vorliegen, erstelle eine plausible Zusammenfassung basierend auf der URL\n"
-            "3. Gib ausschließlich **gültiges minifiziertes JSON** zurück – kein Markdown, keine Kommentare\n"
-            "4. Struktur: {\"title\":\"…\",\"summary_de\":\"…\",\"summary_en\":\"…\"}\n"
-            "5. title: Aussagekräftiger deutscher Titel (max 100 Zeichen)\n"
-            "6. summary_de: Deutsche Zusammenfassung (max 160 Wörter)\n"
-            "7. summary_en: Englische Zusammenfassung (max 160 Wörter)\n"
-            "8. Kein Text vor oder nach dem JSON\n\n"
+            "1. Nutze VORRANGIG den Artikel-Inhalt falls verfügbar, ergänze mit RSS-Informationen\n"
+            "2. Falls kein Artikel-Inhalt verfügbar ist, nutze RSS-Titel und -Beschreibung\n"
+            "3. Falls keine ausreichenden Informationen vorliegen, erstelle eine plausible Zusammenfassung basierend auf der URL\n"
+            "4. Gib ausschließlich **gültiges minifiziertes JSON** zurück – kein Markdown, keine Kommentare\n"
+            "5. Struktur: {\"title\":\"…\",\"summary_de\":\"…\",\"summary_en\":\"…\"}\n"
+            "6. title: Aussagekräftiger deutscher Titel (max 100 Zeichen)\n"
+            "7. summary_de: Deutsche Zusammenfassung (max 160 Wörter)\n"
+            "8. summary_en: Englische Zusammenfassung (max 160 Wörter)\n"
+            "9. Kein Text vor oder nach dem JSON\n\n"
             "### Ausgabe\n"
             "Jetzt antworte mit dem JSON:"
         )
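Aside (not part of the commit): rules 4 and 5 in the new numbering pin the model to a single minified JSON object. A hedged sketch of validating such a response, with raw standing in for whatever the model actually returns:

import json

raw = '{"title":"Beispieltitel","summary_de":"Kurze Zusammenfassung.","summary_en":"Short summary."}'

try:
    data = json.loads(raw)
    # Reject anything that does not carry the three expected keys
    assert {"title", "summary_de", "summary_en"} <= data.keys()
except (json.JSONDecodeError, AssertionError):
    data = None

print(data)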
@@ -327,6 +428,7 @@ class NewsFetcher:
     ) -> Optional[ArticleSummary]:
         """
         Generate a summary of an article using the LLM.
+        Now fetches the actual article content for more accurate summaries.

         Args:
             client: An active httpx AsyncClient for making requests
@@ -342,7 +444,16 @@ class NewsFetcher:
         logger.debug(f"📝 RSS Title: {title[:50]}..." if title else "📝 No RSS title")
         logger.debug(f"📄 RSS Description: {description[:100]}..." if description else "📄 No RSS description")

-        prompt = NewsFetcher.build_prompt(url, title, description)
+        # Fetch article content
+        logger.debug(f"🌐 Fetching article content...")
+        article_content = await NewsFetcher.fetch_article_content(client, url)
+
+        if article_content:
+            logger.info(f"✅ Successfully fetched article content ({len(article_content)} chars)")
+        else:
+            logger.warning(f"⚠️ Could not fetch article content, using RSS data only")
+
+        prompt = NewsFetcher.build_prompt(url, title, description, article_content)
         payload = {
             "model": LLM_MODEL,
             "prompt": prompt,
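Aside (not part of the commit): the hunk ends before the request itself is shown, so the endpoint below is an assumption based on Ollama's standard /api/generate API rather than on this repository's code. A sketch of how a payload like the one built above is typically sent:

import asyncio

import httpx

async def generate(prompt: str) -> str:
    # Assumed local Ollama endpoint; "stream": False returns a single JSON object
    payload = {"model": "qwen2:7b-instruct-q4_K_M", "prompt": prompt, "stream": False}
    async with httpx.AsyncClient(timeout=180) as client:
        resp = await client.post("http://localhost:11434/api/generate", json=payload)
        resp.raise_for_status()
        return resp.json().get("response", "")

# Example (requires a running Ollama instance):
# print(asyncio.run(generate("Fasse diesen Artikel zusammen: ...")))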
@@ -472,6 +583,7 @@ class NewsFetcher:
     ) -> Dict[str, int]:
         """
         Process a single feed, fetching and summarizing all articles.
+        Now saves summaries immediately to the database.

         Args:
             client: An active httpx AsyncClient for making requests
@@ -509,7 +621,7 @@ class NewsFetcher:
                 continue

             if not hasattr(entry, "published_parsed"):
-                logger.debug(f"⏩ Skipping entry {i}: no published date")  # TODO: change back to 0.5
+                logger.debug(f"⏩ Skipping entry {i}: no published date")
                 stats['skipped'] += 1
                 continue

@@ -557,15 +669,9 @@ class NewsFetcher:
             published_timestamp = int(published.timestamp())

-            # Handle source field - it can be a string or dict
-            source_value = entry.get("source", feed_row["url"])
-            if isinstance(source_value, dict):
-                source_title = source_value.get("title", feed_row["url"])
-            else:
-                source_title = source_value if source_value else feed_row["url"]
-
-            logger.debug(f"💾 Storing article in database")
+            logger.debug(f"💾 Storing article in database immediately after summarization")

+            # Store in database immediately after successful summarization
             # Store in database
             try:
                 with db_manager.get_cursor() as cursor:
@@ -583,7 +689,7 @@ class NewsFetcher:
                         )
                     )

-                    logger.info(f"✅ Successfully processed article {i}: {summary['title'][:50]}...")
+                    logger.info(f"✅ Successfully processed and stored article {i}: {summary['title'][:50]}...")
                     stats['successful'] += 1

             except Exception as db_error:
@@ -8,3 +8,4 @@ uvicorn[standard]
 python-multipart
 psycopg2-binary
 sqlalchemy
+beautifulsoup4
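Aside (not part of the commit): after installing the updated requirements (pip install -r requirements.txt), a throwaway check like the following confirms the new beautifulsoup4 dependency parses HTML and strips boilerplate tags the way the extraction code above expects:

from bs4 import BeautifulSoup

html = "<html><body><script>ignore()</script><article>Hello <b>world</b></article></body></html>"
soup = BeautifulSoup(html, "html.parser")
for tag in soup(["script", "style"]):
    tag.decompose()
print(soup.get_text(" ", strip=True))  # -> Hello world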