Example of a crawler spider with Scrapy
<p>This is a Scrapy spider script that crawls a website and extracts information from the web pages it visits. Here's an overview of what the script does:</p> <p>1. It imports …
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | import scrapy
import re
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urljoin, urlparse
# class ExternalLinkExtractor(LinkExtractor):
# def extract_links(self, response):
# links = super().extract_links(response)
# allowed_domains = self.allow_domains
# for link in links:
# parsed_url = urlparse(link.url)
# if parsed_url.netloc and parsed_url.netloc not in allowed_domains:
# yield link
class SearchSpider(CrawlSpider):
    """Crawl the site in ``start_urls`` and emit one snapshot item per page.

    The single crawl rule follows every link the extractor returns
    (``allow=()`` applies no URL pattern filter) and hands each fetched
    response to :meth:`parse_item`.
    """

    name = "search"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com/index.html"]

    rules = (
        Rule(
            LinkExtractor(allow=()),
            callback="parse_item",
            follow=True,
        ),
    )

    # Compiled once: used to collapse whitespace runs in every page body.
    _WHITESPACE_RE = re.compile(r'\s+')

    @staticmethod
    def _decode_header(raw):
        """Return header value *raw* as text, or ``None`` if the header is absent."""
        if raw is None:
            return None
        if isinstance(raw, bytes):
            # Header bytes are not guaranteed to be valid UTF-8; never let a
            # malformed header abort the whole item.
            return raw.decode('utf-8', errors='replace')
        return raw

    @staticmethod
    def _describe_element(element):
        """Build the ``name`` / ``value`` / ``attributes`` record for one element.

        ``attributes`` is a list of single-key dicts, one per attribute whose
        value is not blank; ``value`` is the element's serialized markup.
        """
        attributes = [
            {attr_name: attr_value}
            for attr_name, attr_value in element.attrib.items()
            if attr_value.strip()
        ]
        return {
            'name': element.xpath('name()').get(),
            'value': element.get(),
            'attributes': attributes,
        }

    def parse_item(self, response):
        """Yield a dict describing *response*.

        Keys:
            url: URL of the fetched page.
            domain: network location (host[:port]) of that URL.
            response: page text with whitespace runs collapsed to one space.
            urls: every ``<a href>`` on the page, resolved against the page URL.
            content_type: ``Content-Type`` header as text, ``None`` if missing.
            elements: one record per element, see :meth:`_describe_element`.
            referer: ``Referer`` header of the originating request as text,
                ``None`` if missing.

        Responses that are not text (e.g. images, archives) cannot be decoded
        or queried with XPath; for those, ``response`` is ``''`` and ``urls`` /
        ``elements`` are empty lists instead of the callback raising.
        """
        # Imported here so this block does not depend on edits to the
        # file-level import list.
        from scrapy.http import TextResponse

        content_type = self._decode_header(response.headers.get('Content-Type'))
        # Decoded like Content-Type so every item field is plain text.
        referer = self._decode_header(response.request.headers.get('Referer'))

        page_url = response.url
        # urlparse isolates the host even when a query string or fragment
        # appears before the first path slash, where split('/') would not.
        domain = urlparse(page_url).netloc

        if isinstance(response, TextResponse):
            # response.text uses the encoding Scrapy detected for the page
            # rather than assuming UTF-8.
            body = self._WHITESPACE_RE.sub(' ', response.text).strip()
            cleaned_urls = [
                urljoin(page_url, href)
                for href in response.xpath('//a/@href').getall()
            ]
            elements = [
                self._describe_element(element)
                for element in response.xpath('//*')
            ]
        else:
            body = ''
            cleaned_urls = []
            elements = []

        yield {
            'url': page_url,
            'domain': domain,
            'response': body,
            'urls': cleaned_urls,
            'content_type': content_type,
            'elements': elements,
            'referer': referer,
        }
|