# Standard library
import json
import logging
import time
from urllib.parse import urlparse

# Third-party
import scrapy
from bs4 import BeautifulSoup
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt, RGBColor
from scrapy.linkextractors import LinkExtractor
from scrapy.spidermiddlewares.httperror import HttpError
from scrapy.spiders import CrawlSpider, Rule
from scrapy_splash import SplashRequest
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from twisted.internet.error import DNSLookupError, TCPTimedOutError
class DynamicUniversitySpider(CrawlSpider):
    """Crawl university.com with Selenium, expanding hidden content as it goes.

    Each page is rendered in a single shared headless Chrome driver; common
    "show more"/accordion/toggle elements are clicked so collapsed text enters
    the DOM, the expanded page source is parsed with BeautifulSoup, and the
    extracted text is appended to a python-docx Document that is saved when
    the spider closes.
    """

    name = 'dynamic_university'
    allowed_domains = ['university.com']
    start_urls = ['https://university.com/']

    # Conservative crawl settings: one request at a time with a delay, because
    # every response is re-rendered in the single shared Selenium driver.
    # NOTE: do NOT disable UserAgentMiddleware here — it is the middleware
    # that applies the USER_AGENT setting; disabling it (as an earlier version
    # did) silently drops the custom user agent.
    custom_settings = {
        'ROBOTSTXT_OBEY': True,
        'CONCURRENT_REQUESTS': 1,
        'DOWNLOAD_DELAY': 3,
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'COOKIES_ENABLED': True,
        'DOWNLOAD_TIMEOUT': 30,
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 3,
        'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
    }

    def __init__(self, *args, **kwargs):
        """Create the shared Selenium driver and the output document up front."""
        super().__init__(*args, **kwargs)
        self.setup_selenium()
        self.document = Document()
        self.setup_document()

    def setup_selenium(self):
        """Initialize Selenium WebDriver with appropriate options."""
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # Run in headless mode
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(options=chrome_options)
        # Shared explicit-wait helper (10 s) for future EC-based waits.
        self.wait = WebDriverWait(self.driver, 10)

    def setup_document(self):
        """Write the document title page (centered bold title + page break)."""
        title_para = self.document.add_paragraph()
        title_run = title_para.add_run('University Content (Including Hidden Content)')
        title_run.bold = True
        title_run.font.size = Pt(24)
        title_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        self.document.add_page_break()

    def start_requests(self):
        """Route all start URLs through the Selenium-rendering callback."""
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse_with_selenium)

    def parse_with_selenium(self, response):
        """Render the page in Selenium, expand hidden content, extract it, and
        follow in-domain links.

        Yields follow-up ``scrapy.Request`` objects for allowed links, or a
        dict item describing the error when the page cannot be processed.
        """
        try:
            self.driver.get(response.url)
            # Basic wait for the initial render; expand_all_hidden_content()
            # adds finer-grained waits per interaction.
            time.sleep(2)
            self.expand_all_hidden_content()

            # Parse the fully expanded DOM and append it to the document.
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            self.process_content(soup, response.url)

            # Collect hrefs eagerly BEFORE yielding anything: this generator is
            # suspended at each yield, and a later parse_with_selenium call
            # navigates the SHARED driver, which would make the remaining
            # WebElements stale mid-iteration.
            hrefs = []
            for link in self.driver.find_elements(By.TAG_NAME, 'a'):
                try:
                    href = link.get_attribute('href')
                    if href and self._is_allowed(href):
                        hrefs.append(href)
                except Exception as e:
                    self.logger.error(f"Error processing link: {str(e)}")

            for href in hrefs:
                yield scrapy.Request(href, callback=self.parse_with_selenium)
        except Exception as e:
            self.logger.error(f"Error processing {response.url}: {str(e)}")
            yield {
                'url': response.url,
                'status': 'error',
                'error': str(e)
            }

    def _is_allowed(self, url):
        """Return True if *url*'s host is an allowed domain or a subdomain of one.

        Replaces the earlier substring test (``domain in url``), which also
        matched URLs that merely mentioned the domain in their path or query.
        """
        host = urlparse(url).netloc.lower()
        return any(
            host == domain or host.endswith('.' + domain)
            for domain in self.allowed_domains
        )

    def expand_all_hidden_content(self):
        """Click every recognizable expander on the current page, then scroll
        to the bottom once to trigger lazy loading.

        Failures to click individual elements are expected (overlays, detached
        nodes) and logged at debug level only.
        """
        try:
            # Common XPath patterns for expandable content.
            expandable_elements = [
                # Click on "Show more" or "Read more" buttons
                "//button[contains(text(), 'Show more') or contains(text(), 'Read more')]",
                "//a[contains(text(), 'Show more') or contains(text(), 'Read more')]",
                # Expand accordion elements
                "//div[contains(@class, 'accordion')]//button",
                # Toggle elements
                "//div[contains(@class, 'toggle')]",
                # Plus icons or expand icons
                "//i[contains(@class, 'expand') or contains(@class, 'plus')]",
                # Custom classes (add specific ones for your site)
                "//div[contains(@class, 'expandable')]",
                "//div[contains(@class, 'collapsible')]"
            ]
            for xpath in expandable_elements:
                try:
                    elements = self.driver.find_elements(By.XPATH, xpath)
                    for element in elements:
                        try:
                            if element.is_displayed() and element.is_enabled():
                                # JS click avoids "element not interactable"
                                # errors from overlapping elements.
                                self.driver.execute_script("arguments[0].click();", element)
                                time.sleep(0.5)  # Wait for animation
                        except Exception as e:
                            self.logger.debug(f"Could not click element: {str(e)}")
                except Exception as e:
                    self.logger.debug(f"Error finding elements with xpath {xpath}: {str(e)}")
            # Scroll to bottom to trigger lazy loading.
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
        except Exception as e:
            self.logger.error(f"Error expanding hidden content: {str(e)}")

    def process_content(self, soup, url):
        """Append the page's text (headings, paragraphs, lists) to the document.

        Args:
            soup: BeautifulSoup tree of the fully expanded page source.
            url: source URL, written as an italic header above the content.
        """
        try:
            # Add URL as header.
            url_para = self.document.add_paragraph()
            url_run = url_para.add_run(f"Source: {url}")
            url_run.italic = True

            # Page title (first <h1>), if present.
            title = soup.find('h1')
            if title:
                title_para = self.document.add_paragraph()
                title_run = title_para.add_run(title.text.strip())
                title_run.bold = True
                title_run.font.size = Pt(16)

            # All visible (and previously hidden) textual content.
            for elem in soup.find_all(['p', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol']):
                # Skip empty elements.
                if not elem.text.strip():
                    continue
                para = self.document.add_paragraph()
                if elem.name.startswith('h'):
                    # Headings: bold, slightly larger.
                    run = para.add_run(elem.text.strip())
                    run.bold = True
                    run.font.size = Pt(14)
                elif elem.name in ['ul', 'ol']:
                    # Lists: one bulleted paragraph per item.
                    for li in elem.find_all('li'):
                        list_para = self.document.add_paragraph()
                        list_para.add_run('• ' + li.text.strip())
                else:
                    # Regular paragraphs.
                    para.add_run(elem.text.strip())

            # Page break between crawled pages.
            self.document.add_page_break()
        except Exception as e:
            self.logger.error(f"Error processing content: {str(e)}")

    def closed(self, reason):
        """Quit the Selenium driver and save the document on spider shutdown."""
        try:
            # hasattr guard: setup_selenium may have failed before assignment.
            if hasattr(self, 'driver'):
                self.driver.quit()
            self.document.save('University_Content_With_Hidden.docx')
            self.logger.info("Document saved successfully")
        except Exception as e:
            self.logger.error(f"Error during spider closure: {str(e)}")
        self.logger.info(f"Spider closed: {reason}")