Add File
281
src/landppt/services/research/content_extractor.py
Normal file
@@ -0,0 +1,281 @@
"""
Web Content Extraction Pipeline for Research Functionality

This module provides robust web content extraction using BeautifulSoup to fetch
and parse HTML content from web pages, with proper error handling and content cleaning.
"""

import asyncio
import logging
import re
import time
from typing import Dict, List, Optional, Any, Set
from urllib.parse import urljoin, urlparse
import aiohttp
from bs4 import BeautifulSoup, Comment
from langchain.text_splitter import RecursiveCharacterTextSplitter

from ...core.config import ai_config

logger = logging.getLogger(__name__)


class ExtractedContent:
    """Represents extracted content from a web page"""

    def __init__(self, url: str, title: str = "", content: str = "",
                 metadata: Optional[Dict[str, Any]] = None):
        self.url = url
        self.title = title
        self.content = content
        self.metadata = metadata or {}
        self.extraction_time = time.time()
        self.word_count = len(content.split()) if content else 0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization"""
        return {
            'url': self.url,
            'title': self.title,
            'content': self.content,
            'metadata': self.metadata,
            'extraction_time': self.extraction_time,
            'word_count': self.word_count
        }


class WebContentExtractor:
    """Web content extraction pipeline using BeautifulSoup"""

    def __init__(self):
        self.timeout = ai_config.research_extraction_timeout
        self.max_content_length = ai_config.research_max_content_length
        self.user_agent = "LandPPT Research Bot 1.0"

        # Content selectors for different types of content
        self.content_selectors = [
            'article',
            'main',
            '.content',
            '.post-content',
            '.entry-content',
            '.article-content',
            '.story-body',
            '.post-body',
            '#content',
            '#main-content'
        ]

        # Tags to remove completely
        self.remove_tags = {
            'script', 'style', 'nav', 'header', 'footer', 'aside',
            'advertisement', 'ads', 'sidebar', 'menu', 'popup'
        }

        # Text splitter for long content
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.max_content_length,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""]
        )

    def _clean_text(self, text: str) -> str:
        """Clean and normalize extracted text"""
        if not text:
            return ""

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove common unwanted patterns
        text = re.sub(r'(Cookie|Privacy) Policy.*?(?=\n|$)', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Subscribe.*?newsletter.*?(?=\n|$)', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Follow us on.*?(?=\n|$)', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Share this.*?(?=\n|$)', '', text, flags=re.IGNORECASE)

        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

        return text.strip()

    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
        """Extract metadata from HTML"""
        metadata = {}

        # Basic metadata
        if soup.title:
            metadata['title'] = soup.title.string.strip() if soup.title.string else ""

        # Meta tags
        meta_tags = soup.find_all('meta')
        for tag in meta_tags:
            name = tag.get('name') or tag.get('property')
            content = tag.get('content')
            if name and content:
                metadata[name] = content

        # Language
        html_tag = soup.find('html')
        if html_tag and html_tag.get('lang'):
            metadata['language'] = html_tag.get('lang')

        # Domain
        parsed_url = urlparse(url)
        metadata['domain'] = parsed_url.netloc

        return metadata

    def _extract_main_content(self, soup: BeautifulSoup) -> str:
        """Extract main content from HTML using various strategies"""

        # Remove unwanted tags
        for tag_name in self.remove_tags:
            for tag in soup.find_all(tag_name):
                tag.decompose()

        # Remove comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Try content selectors in order of preference
        for selector in self.content_selectors:
            content_element = soup.select_one(selector)
            if content_element:
                text = content_element.get_text(separator=' ', strip=True)
                if len(text) > 100:  # Minimum content length
                    return self._clean_text(text)

        # Fallback: extract from body
        body = soup.find('body')
        if body:
            # Remove navigation, sidebar, and footer elements
            for element in body.find_all(['nav', 'aside', 'footer', 'header']):
                element.decompose()

            text = body.get_text(separator=' ', strip=True)
            return self._clean_text(text)

        # Last resort: get all text
        return self._clean_text(soup.get_text(separator=' ', strip=True))

    async def extract_content(self, url: str) -> Optional[ExtractedContent]:
        """
        Extract content from a single URL

        Args:
            url: URL to extract content from

        Returns:
            ExtractedContent object or None if extraction fails
        """
        try:
            headers = {
                'User-Agent': self.user_agent,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
            }

            async with aiohttp.ClientSession(
                timeout=aiohttp.ClientTimeout(total=self.timeout),
                headers=headers
            ) as session:
                async with session.get(url) as response:
                    if response.status != 200:
                        logger.warning(f"Failed to fetch {url}: HTTP {response.status}")
                        return None

                    # Check content type
                    content_type = response.headers.get('content-type', '').lower()
                    if 'text/html' not in content_type:
                        logger.warning(f"Skipping non-HTML content: {url}")
                        return None

                    html_content = await response.text()

                    # Parse HTML
                    soup = BeautifulSoup(html_content, 'html.parser')

                    # Extract metadata
                    metadata = self._extract_metadata(soup, url)

                    # Extract main content
                    content = self._extract_main_content(soup)

                    # Limit content length
                    if len(content) > self.max_content_length:
                        chunks = self.text_splitter.split_text(content)
                        content = chunks[0] if chunks else content[:self.max_content_length]

                    # Get title
                    title = metadata.get('title', '')
                    if not title and soup.title:
                        title = soup.title.string.strip() if soup.title.string else ""

                    extracted = ExtractedContent(
                        url=url,
                        title=title,
                        content=content,
                        metadata=metadata
                    )

                    logger.info(f"Extracted {extracted.word_count} words from {url}")
                    return extracted

        except asyncio.TimeoutError:
            logger.warning(f"Timeout extracting content from {url}")
            return None
        except Exception as e:
            logger.warning(f"Error extracting content from {url}: {e}")
            return None

    async def extract_multiple(self, urls: List[str],
                               max_concurrent: int = 5,
                               delay_between_requests: float = 0.5) -> List[ExtractedContent]:
        """
        Extract content from multiple URLs with concurrency control

        Args:
            urls: List of URLs to extract content from
            max_concurrent: Maximum concurrent requests
            delay_between_requests: Delay between requests in seconds

        Returns:
            List of ExtractedContent objects
        """
        semaphore = asyncio.Semaphore(max_concurrent)
        results = []

        async def extract_with_semaphore(url: str) -> Optional[ExtractedContent]:
            async with semaphore:
                result = await self.extract_content(url)
                if delay_between_requests > 0:
                    await asyncio.sleep(delay_between_requests)
                return result

        # Create tasks for all URLs
        tasks = [extract_with_semaphore(url) for url in urls]

        # Execute tasks and collect results
        completed_results = await asyncio.gather(*tasks, return_exceptions=True)

        for result in completed_results:
            if isinstance(result, ExtractedContent):
                results.append(result)
            elif isinstance(result, Exception):
                logger.warning(f"Content extraction failed: {result}")

        logger.info(f"Successfully extracted content from {len(results)}/{len(urls)} URLs")
        return results

    def get_status(self) -> Dict[str, Any]:
        """Get extractor status information"""
        return {
            'timeout': self.timeout,
            'max_content_length': self.max_content_length,
            'user_agent': self.user_agent,
            'content_selectors': self.content_selectors,
            'remove_tags': list(self.remove_tags)
        }
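
For context, a minimal single-URL usage sketch (not part of this commit). It assumes the package is importable as landppt.services.research.content_extractor and that ai_config supplies research_extraction_timeout and research_max_content_length, which the constructor reads; the URL is illustrative only.

import asyncio

from landppt.services.research.content_extractor import WebContentExtractor

async def main() -> None:
    extractor = WebContentExtractor()
    # extract_content returns None on HTTP errors, non-HTML responses, or timeouts
    result = await extractor.extract_content("https://example.com/article")
    if result is not None:
        print(result.title, result.word_count)
        record = result.to_dict()  # plain dict, ready for JSON serialization
        print(record["metadata"].get("domain"))

asyncio.run(main())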
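
Similarly, a hedged sketch of batch extraction using the concurrency controls exposed by extract_multiple; the URLs and parameter values below are placeholders, not project defaults.

import asyncio

from landppt.services.research.content_extractor import WebContentExtractor

async def main() -> None:
    urls = [
        "https://example.com/a",
        "https://example.com/b",
        "https://example.com/c",
    ]
    extractor = WebContentExtractor()
    # At most two requests in flight, with a pause after each fetch to stay polite to servers
    results = await extractor.extract_multiple(urls, max_concurrent=2, delay_between_requests=1.0)
    for item in results:
        print(f"{item.url}: {item.word_count} words")
    # get_status() reports the active timeout, content limit, and selector configuration
    print(extractor.get_status())

asyncio.run(main())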