Python | Dive into Code

Example of a crawler spider with scrapy

This is a Scrapy spider script that crawls a website and extracts information from the web pages it visits. Here's an overview of what the script does: 1. It imports …

python

import scrapy
import re
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urljoin, urlparse


# class ExternalLinkExtractor(LinkExtractor):
#     def extract_links(self, response):
#         links = super().extract_links(response)
#         allowed_domains = self.allow_domains
#         for link in links:
#             parsed_url = urlparse(link.url)
#             if parsed_url.netloc and parsed_url.netloc not in allowed_domains:
#                 yield link



class SearchSpider(CrawlSpider):
    """Crawl the start URL's site and emit one item per followed page.

    Every link found by the link extractor is followed and handed to
    :meth:`parse_item`, which yields a dict describing the page: its URL,
    domain, whitespace-normalised body, absolute outgoing links, content
    type, referer and a dump of every element with its attributes.
    """

    name = "search"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com/index.html"]

    # An empty ``allow`` tuple puts no pattern restriction on extracted links.
    rules = (
        Rule(
            LinkExtractor(allow=()),
            callback="parse_item",
            follow=True,
        ),
    )

    @staticmethod
    def _decode_header(value):
        """Return a raw header value as text, or ``None`` when it is absent."""
        if value is None or isinstance(value, str):
            return value
        # Undecodable bytes are replaced rather than aborting the whole item.
        return value.decode('utf-8', errors='replace')

    @staticmethod
    def _describe_element(element):
        """Build the ``name`` / ``value`` / ``attributes`` dict for one element.

        ``attributes`` is a list of single-key dicts, one per attribute whose
        value is not blank.
        """
        attributes = [
            {attr_name: attr_value}
            for attr_name, attr_value in element.attrib.items()
            if attr_value.strip() != ''
        ]
        return {
            'name': element.xpath('name()').get(),
            'value': element.get(),
            'attributes': attributes,
        }

    def parse_item(self, response):
        """Yield a single dict describing ``response``.

        Args:
            response: the downloaded page; it must support ``text`` and
                ``xpath`` (i.e. be a text/HTML response).

        Yields:
            dict with keys ``url``, ``domain``, ``response``, ``urls``,
            ``content_type``, ``elements`` and ``referer``.
            ``content_type`` and ``referer`` are ``None`` when the
            corresponding header is missing.
        """
        # Headers may be missing, so decode them through a None-safe helper.
        content_type = self._decode_header(response.headers.get('Content-Type'))
        referer = self._decode_header(response.request.headers.get('Referer'))

        # ``response.text`` decodes with the response's own encoding instead of
        # assuming UTF-8; runs of whitespace are then collapsed to one space.
        body = re.sub(r'\s+', ' ', response.text).strip()

        page_url = response.url
        domain = urlparse(page_url).netloc

        # Resolve every href against the page URL so all links are absolute.
        cleaned_urls = [
            urljoin(page_url, href)
            for href in response.xpath('//a/@href').getall()
        ]

        # Describe every element in the document, in document order.
        elements = [
            self._describe_element(element)
            for element in response.xpath('//*')
        ]

        yield {
            'url': page_url,
            'domain': domain,
            'response': body,
            'urls': cleaned_urls,
            'content_type': content_type,
            'elements': elements,
            'referer': referer,
        }

Jul 07, 2023 python

Python

Example of a multi-level dict in Python

<ol> <li><code>my_dict</code> is the main dictionary that contains two key-value pairs.</li> <li>The keys in the <code>my_dict</code> are <code>level1_key1</code> and <code>level1_key2</code>.</li> <li>The values corresponding to each key in <code>my_dict</code> are also …

python

# Two-level mapping: each top-level key holds its own inner dictionary.
my_dict = dict(
    level1_key1=dict(level2_key1="value1", level2_key2="value2"),
    level1_key2=dict(level2_key3="value3", level2_key4="value4"),
)

May 05, 2023 python

Python

Generate random hex color code in python

This is a Python script that generates random hexadecimal color codes and prints them repeatedly with a one-second delay. The <code>get_random_color()</code> function generates a random color by randomly selecting characters from …

python

import random
import time

def get_random_color():
    """Return a random color as '#' followed by six uppercase hex digits."""
    hex_digits = '0123456789ABCDEF'
    # Draw six independent digit indices, then glue the digits together.
    picked = [hex_digits[random.randint(0, 15)] for _ in range(6)]
    return '#' + ''.join(picked)

# Print a freshly generated color once per second until the process is stopped.
while True:
    print(get_random_color())
    time.sleep(1)

May 02, 2023 python

Python

Generate a string based on permutations

The provided code generates all possible permutations of the elements in the list `my_list` and then concatenates each permutation into a single string. Let's understand the code step by step: …

python

import itertools

# Source elements to be rearranged
my_list = ['a', 'b', 'c']

# Every ordering of the elements, materialised as a list of tuples
permutations = [*itertools.permutations(my_list)]

# Flatten all orderings, in generation order, into one string
result = ''.join(itertools.chain.from_iterable(permutations))

print(result)

Apr 08, 2023 python

Python

Write python response data to json file

This Python code is a script that sends an HTTP GET request to a URL (in this case, '<a href="https://google.com/" target="_new">https://google.com</a>') using the <code>requests</code> library and saves the response data …

python

import json
import requests

# Fetch a URL and save the response next to the script: parsed JSON goes to
# response_data.json, any other content type goes to response_data.txt.
url = 'https://google.com'
headers = {}

# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=30)
# The Content-Type header is optional; fall back to '' instead of raising KeyError.
response_headers = response.headers.get('Content-Type', '')

if response.status_code == 200:
    if 'json' in response_headers:
        filetype = 'json'
        try:
            content = response.json()
        except ValueError:
            # ValueError is the common base of the JSON decode errors, so this
            # also covers a non-stdlib JSON backend inside requests.
            print("Error: Response content is not in JSON format.")
            content = None
    else:
        filetype = 'txt'  # You can set a default extension for other content types.
        # Use the decoded text: the file is opened in text mode, so writing the
        # raw bytes from response.content would raise TypeError.
        content = response.text

    filename = "response_data"
    # Only create the file and report success when there is something to save.
    if content is not None:
        with open(f"{filename}.{filetype}", 'w', encoding='utf-8') as f:
            if filetype == 'json':
                json.dump(content, f, ensure_ascii=False, indent=4)
            else:
                f.write(content)
        print(f"Response saved to {filename}.{filetype}")
else:
    print(f"Failed to fetch data. Status Code: {response.status_code}")

Apr 04, 2023 python