Python

Python

Explore Python code snippets and tutorials

Python

Example of a crawler spider with scrapy

<p>This is a Scrapy spider script that crawls a website and extracts information from the web pages it visits. Here's an overview of what the script does:</p> <p>1. It imports …

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import scrapy
import re
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urljoin, urlparse


# class ExternalLinkExtractor(LinkExtractor):
#     def extract_links(self, response):
#         links = super().extract_links(response)
#         allowed_domains = self.allow_domains
#         for link in links:
#             parsed_url = urlparse(link.url)
#             if parsed_url.netloc and parsed_url.netloc not in allowed_domains:
#                 yield link



class SearchSpider(CrawlSpider):
    """Crawl the configured site and yield one descriptive item per text page.

    Every link extracted by the single rule below is followed and handed to
    :meth:`parse_item`, which reports the page URL, its domain, the
    whitespace-collapsed body, all absolute link targets, the content type,
    every element of the document and the ``Referer`` request header.
    """

    name = "search"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com/index.html"]

    rules = (
        Rule(
            LinkExtractor(allow=()),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield a dict describing ``response``.

        Non-text responses (images, archives, ...) are skipped: they have no
        meaningful text body and do not support XPath selection.

        Yields:
            dict: with the keys ``url``, ``domain``, ``response``, ``urls``,
            ``content_type``, ``elements`` and ``referer``.
        """
        # Imported locally so this block does not depend on extra
        # module-level imports.
        from scrapy.http import TextResponse

        if not isinstance(response, TextResponse):
            self.logger.debug("Skipping non-text response: %s", response.url)
            return

        # The Content-Type header is optional; fall back to an empty string
        # instead of calling .decode() on None.
        raw_content_type = response.headers.get('Content-Type')
        if raw_content_type is None:
            content_type = ''
        else:
            content_type = raw_content_type.decode('utf-8', errors='replace')

        # response.text decodes the body with the response's own encoding, so
        # pages that are not UTF-8 no longer raise UnicodeDecodeError.
        body = re.sub(r'\s+', ' ', response.text).strip()

        page_url = response.url

        # Header value exactly as stored on the request (None when absent).
        referer = response.request.headers.get('Referer')

        # Network location (host[:port]) of the page URL.
        domain = urlparse(page_url).netloc

        # Resolve every anchor target against the page URL.
        cleaned_urls = [
            urljoin(page_url, href)
            for href in response.xpath('//a/@href').getall()
        ]

        # Describe every element: tag name, serialized markup and its
        # non-blank attributes (one single-key dict per attribute).
        elements = []
        for element in response.xpath('//*'):
            element_attributes = [
                {attr: attr_value}
                for attr, attr_value in element.attrib.items()
                if attr_value.strip() != ''
            ]
            elements.append({
                'name': element.xpath('name()').get(),
                'value': element.get(),
                'attributes': element_attributes,
            })

        yield {
            'url': page_url,
            'domain': domain,
            'response': body,
            'urls': cleaned_urls,
            'content_type': content_type,
            'elements': elements,
            'referer': referer,
        }
Python

example multi level dict in python

<ol> <li><code>my_dict</code> is the main dictionary that contains two key-value pairs.</li> <li>The keys in the <code>my_dict</code> are <code>level1_key1</code> and <code>level1_key2</code>.</li> <li>The values corresponding to each key in <code>my_dict</code> are also …

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
# Two-level mapping: each top-level key holds its own dictionary of
# second-level key/value pairs.
my_dict = dict(
    level1_key1=dict(level2_key1="value1", level2_key2="value2"),
    level1_key2=dict(level2_key3="value3", level2_key4="value4"),
)
Python

Generate random hex color code in python

<p>This is a Python script that generates random hexadecimal color codes and prints them repeatedly with a one-second delay. The <code>get_random_color()</code> function generates a random color by randomly selecting characters from …

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
import random
import time

def get_random_color():
    """Return a random color as a ``#RRGGBB`` string of uppercase hex digits."""
    letters = '0123456789ABCDEF'
    # Pick six hex digits and join them once instead of growing the string
    # one character at a time.
    return '#' + ''.join(random.choice(letters) for _ in range(6))

# Print a fresh random color once per second, forever.
# get_random_color() never returns None, so the sentinel is never hit and
# the iterator is endless.
for color in iter(get_random_color, None):
    print(color)
    time.sleep(1)
Python

Generate a string based on permutations

<p>The provided code generates all possible permutations of the elements in the list <code>my_list</code> and then joins all of those permutations together into a single string. Let's understand the code step by step:</p> …

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
import itertools

# Elements to permute
my_list = ['a', 'b', 'c']

# Every ordering of my_list, as a list of tuples
permutations = list(itertools.permutations(my_list))

# Flatten the tuples into one stream of characters and join them in a
# single pass, giving all permutations back to back in one string
result = ''.join(itertools.chain.from_iterable(permutations))

print(result)
Python

Write python response data to json file

<p>This Python code is a script that sends an HTTP GET request to a URL (in this case, '<a href="https://google.com/" target="_new">https://google.com</a>') using the <code>requests</code> library and saves the response data …

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import json
import requests

# Fetch `url` and save the response to "response_data.json" when the body is
# valid JSON, otherwise to "response_data.txt" as decoded text.
url = 'https://google.com'
headers = {}

# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
# The Content-Type header is optional; default to '' rather than raising
# KeyError when it is absent.
response_headers = response.headers.get('Content-Type', '')

if response.status_code == 200:
    # Default to the decoded text body: the file below is opened in text
    # mode, so writing the raw bytes of response.content would raise
    # TypeError.
    filetype = 'txt'
    content = response.text

    if 'json' in response_headers:
        try:
            content = response.json()
            filetype = 'json'
        except ValueError:
            # The JSON decode errors raised by requests are ValueError
            # subclasses. Keep the text fallback so the body is still saved
            # instead of leaving an empty .json file behind.
            print("Error: Response content is not in JSON format.")

    filename = "response_data"
    with open(f"{filename}.{filetype}", 'w', encoding='utf-8') as f:
        if filetype == 'json':
            json.dump(content, f, ensure_ascii=False, indent=4)
        else:
            f.write(content)
    print(f"Response saved to {filename}.{filetype}")
else:
    print(f"Failed to fetch data. Status Code: {response.status_code}")