Example Python script using Scrapy and Selenium to scrape a website into a Word document

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from bs4 import BeautifulSoup
from docx import Document
import logging
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TCPTimedOutError
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from scrapy_splash import SplashRequest
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time

class DynamicUniversitySpider(CrawlSpider):
    """Crawl a university site with Selenium so JavaScript-hidden content
    (accordions, "Show more"/"Read more" toggles, lazy-loaded sections) is
    expanded before the page text is appended to a Word document.

    The document is saved to ``University_Content_With_Hidden.docx`` when
    the spider closes.
    """

    name = 'dynamic_university'
    allowed_domains = ['university.com']
    start_urls = ['https://university.com/']

    # Scrapy only honours ``custom_settings`` when it is a *class* attribute
    # on the spider; defining it at module level has no effect.
    custom_settings = {
        'ROBOTSTXT_OBEY': True,
        # A single shared Selenium driver cannot render pages in parallel.
        'CONCURRENT_REQUESTS': 1,
        'DOWNLOAD_DELAY': 3,
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'COOKIES_ENABLED': True,
        'DOWNLOAD_TIMEOUT': 30,
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 3,
        'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
        # Disable the stock middleware so USER_AGENT above is used verbatim.
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
        },
    }

    def __init__(self, *args, **kwargs):
        """Create the Selenium driver and the output Word document."""
        super().__init__(*args, **kwargs)
        self.setup_selenium()
        self.document = Document()
        self.setup_document()

    def setup_selenium(self):
        """Initialize a headless Chrome WebDriver and an explicit wait."""
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # no visible browser window
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')  # avoid /dev/shm exhaustion in containers

        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, 10)

    def setup_document(self):
        """Write the document title page (centered bold heading + page break)."""
        title_para = self.document.add_paragraph()
        title_run = title_para.add_run('University Content (Including Hidden Content)')
        title_run.bold = True
        title_run.font.size = Pt(24)
        title_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        self.document.add_page_break()

    def start_requests(self):
        """Route every start URL through the Selenium-backed callback."""
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse_with_selenium)

    def parse_with_selenium(self, response):
        """Render ``response.url`` in Selenium, expand hidden content, append
        the page text to the document, then follow in-domain links.

        Yields follow-up :class:`scrapy.Request` objects, or a single error
        item dict if rendering the page fails.
        """
        try:
            self.driver.get(response.url)

            # Crude wait for the initial render; expand_all_hidden_content()
            # does the finer-grained waiting per interaction.
            time.sleep(2)

            self.expand_all_hidden_content()

            # Parse the post-expansion DOM, not Scrapy's original response body.
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            self.process_content(soup, response.url)

            # Collect hrefs *before* yielding: yielding suspends this
            # generator while other callbacks reuse the shared driver, which
            # would leave these WebElements stale. A set also avoids
            # emitting the same URL many times from one page.
            hrefs = set()
            for link in self.driver.find_elements(By.TAG_NAME, 'a'):
                try:
                    href = link.get_attribute('href')
                except Exception as e:
                    self.logger.error(f"Error processing link: {str(e)}")
                    continue
                if href and self.allowed_domains[0] in href:
                    hrefs.add(href)

            for href in hrefs:
                yield scrapy.Request(href, callback=self.parse_with_selenium)

        except Exception as e:
            self.logger.error(f"Error processing {response.url}: {str(e)}")
            yield {
                'url': response.url,
                'status': 'error',
                'error': str(e)
            }

    def expand_all_hidden_content(self):
        """Click every expandable element found on the page and scroll to the
        bottom so collapsed and lazy-loaded content becomes part of the DOM.

        Failures on individual elements are logged at DEBUG and skipped —
        expansion is best-effort.
        """
        try:
            # XPath patterns for common collapse/expand widgets.
            expandable_xpaths = [
                # "Show more" / "Read more" buttons and links
                "//button[contains(text(), 'Show more') or contains(text(), 'Read more')]",
                "//a[contains(text(), 'Show more') or contains(text(), 'Read more')]",
                # Accordion triggers
                "//div[contains(@class, 'accordion')]//button",
                # Toggle containers
                "//div[contains(@class, 'toggle')]",
                # Plus / expand icons
                "//i[contains(@class, 'expand') or contains(@class, 'plus')]",
                # Generic custom classes (extend for a specific site)
                "//div[contains(@class, 'expandable')]",
                "//div[contains(@class, 'collapsible')]"
            ]

            for xpath in expandable_xpaths:
                try:
                    for element in self.driver.find_elements(By.XPATH, xpath):
                        try:
                            if element.is_displayed() and element.is_enabled():
                                # JS click bypasses overlay/interception issues
                                # that element.click() can hit.
                                self.driver.execute_script("arguments[0].click();", element)
                                time.sleep(0.5)  # let the expand animation finish
                        except Exception as e:
                            self.logger.debug(f"Could not click element: {str(e)}")
                except Exception as e:
                    self.logger.debug(f"Error finding elements with xpath {xpath}: {str(e)}")

            # Scroll to the bottom to trigger lazy loading.
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)

        except Exception as e:
            self.logger.error(f"Error expanding hidden content: {str(e)}")

    def process_content(self, soup, url):
        """Append one page's text (source URL, title, body text, lists) to the
        document, followed by a page break.

        :param soup: BeautifulSoup of the fully expanded page DOM.
        :param url: source URL, written as an italic header line.
        """
        try:
            # Source URL header
            url_para = self.document.add_paragraph()
            url_run = url_para.add_run(f"Source: {url}")
            url_run.italic = True

            # Page title (first <h1>), if any
            title = soup.find('h1')
            if title:
                title_para = self.document.add_paragraph()
                title_run = title_para.add_run(title.text.strip())
                title_run.bold = True
                title_run.font.size = Pt(16)

            for elem in soup.find_all(['p', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol']):
                text = elem.text.strip()
                if not text:
                    continue  # skip empty elements

                if elem.name in ['ul', 'ol']:
                    # One paragraph per list item. Creating the paragraphs
                    # here (rather than one shared paragraph up front)
                    # avoids the stray empty paragraph the old code left
                    # before every list.
                    for li in elem.find_all('li'):
                        list_para = self.document.add_paragraph()
                        list_para.add_run('• ' + li.text.strip())
                elif elem.name.startswith('h'):
                    # Sub-headings: bold, slightly larger
                    para = self.document.add_paragraph()
                    run = para.add_run(text)
                    run.bold = True
                    run.font.size = Pt(14)
                else:
                    # Regular paragraphs
                    para = self.document.add_paragraph()
                    para.add_run(text)

            # Page break between scraped pages
            self.document.add_page_break()

        except Exception as e:
            self.logger.error(f"Error processing content: {str(e)}")

    def closed(self, reason):
        """Quit the WebDriver and save the Word document when the spider ends."""
        try:
            # Guard with hasattr: setup_selenium may have failed in __init__.
            if hasattr(self, 'driver'):
                self.driver.quit()

            self.document.save('University_Content_With_Hidden.docx')
            self.logger.info("Document saved successfully")
        except Exception as e:
            self.logger.error(f"Error during spider closure: {str(e)}")

        self.logger.info(f"Spider closed: {reason}")

# Enhanced settings for dynamic content handling
# NOTE(review): Scrapy only reads ``custom_settings`` when it is a *class*
# attribute on the spider; this module-level dict is never picked up by the
# crawler and currently has no effect. Move it inside the spider class.
custom_settings = {
    'ROBOTSTXT_OBEY': True,
    # One request at a time — a single shared Selenium driver cannot render
    # pages concurrently.
    'CONCURRENT_REQUESTS': 1,
    'DOWNLOAD_DELAY': 3,
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'COOKIES_ENABLED': True,
    'DOWNLOAD_TIMEOUT': 30,
    'RETRY_ENABLED': True,
    'RETRY_TIMES': 3,
    'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
    
    # Additional settings for handling JavaScript
    # Disabling the stock middleware so the USER_AGENT string above is used
    # verbatim.
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    }
}

Subscribe to Code, Query, Ship, and Learn

Don’t miss out on the latest issues. Sign up now to get access to the library of members-only issues.
jamie@example.com
Subscribe