January 16, 2024

Python Crawler Repo

Repo for a crawler I used to crawl novels on the web.

main.py

# -*- coding:utf-8 -*-
import requests
import re
import time
import random

# ======== All variables =========
# Book url
book_url = 'https://xxx.com/book/123/'
# Single chapter url
single_chapter_url = ''
# List of all chapter urls
chapter_index_url_list = []
# Path of the txt file the chapter index urls are saved to
novel_index_path = 'novel_index.txt'
# Path of the txt file the book is saved to
novel_path = 'novel.txt'
# Counter, used to count the chapters crawled
count = 0
# Timer, used to count the total crawling time
timeCount = 0
# Simulate a browser header to lower the risk of being recognised as a crawler
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/500.66 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/500.66'
}


# ========= Regex Patterns =========
# Regex to extract the chapter index urls
single_chapter_pattern = r'<li num="(\d+)"><a href="(.*?)">(.*?)<\/a><\/li>'
# Regex to extract the chapter title
title_regex = r'<h1 class="xxx">(.*?)</h1>'
# Regex to extract the chapter contents
content_regex = r'<br />\s*&emsp;&emsp;(.*?)\s*<br />'


# ====== Test code ======
# Check for book index url
# response_1 = requests.get(book_url, headers=headers)
# response_1.encoding = 'gbk'
# print(response_1.text)

# Check for title and contents
# response_2 = requests.get('https://xxx.com/txt/123/123', headers=headers)
# response_2.encoding = 'gbk'
# print(response_2.text)
# title = re.findall(title_regex, response_2.text)
# content = re.findall(content_regex, response_2.text)
# print(title)
# print(content)


# ========= Extract all chapter index urls =========
# Send a GET request to the book index page with the requests library and store the response in response_1
response_1 = requests.get(book_url, headers=headers)
# Website encoding
response_1.encoding = 'utf-8'
# Extract all chapter index urls
matches = re.finditer(single_chapter_pattern, response_1.text)
# Extracted content: (chapter number, chapter url) tuples
results = [(match.group(1), match.group(2)) for match in matches]
if results:
    # Only the link is needed, so ignore the chapter number and keep the url
    for cn, url in results:
        single_chapter_url = url
        chapter_index_url_list.append(single_chapter_url)
else:
    print("No match found.")
print("In total " + str(len(chapter_index_url_list)) + " chapters.")


# ========= Save chapter index urls to a txt file =========
# with open(novel_index_path, 'a+', encoding="utf-8") as file:
#     for c in chapter_index_url_list:
#         file.write(c + '\n')


# ========= Extract title and content =========
print("Start crawling...")
# Open the txt file for writing, 'a+' means append mode
with open(novel_path, 'a+', encoding="gbk") as file:
    # Traverse the chapter index url list
    for x in chapter_index_url_list:
        # Request the chapter page until it does not return a 503 error - Need to improve
        while True:
            response_2 = requests.get(x, headers=headers)
            if '503 Service Temporarily Unavailable' not in response_2.text:
                break
            else:
                print('Data lost, retrying in 3s')
                time.sleep(3)
        # Encoding
        response_2.encoding = "utf-8"
        # Extract the title as a list [ ]
        title = re.findall(title_regex, response_2.text)
        # Extract the contents as a list [ ]
        contents = re.findall(content_regex, response_2.text)
        # Write the title to the txt file
        file.write(title[0] + '\n')
        # Write the content to the txt file, line by line
        for content in contents:
            file.write(content + '\n' + '\n')
        file.write('\n' + '\n')
        # Add 1 for each chapter successfully crawled and written
        count += 1
        # Print out the current progress
        print('Chapter {}, Title: {}, Done!'.format(count, title[0]))
        # Sleep for a random time to simulate a human reading at different speeds
        randomSleepTime = random.randint(12, 17)
        time.sleep(randomSleepTime)
        # Record the total crawling time
        timeCount += randomSleepTime
# No explicit file.close() needed, the with statement closes the file automatically

print('Finished crawling! In total it cost {} seconds.'.format(timeCount))

Need to improve:
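The main thing flagged in the code is the 503 handling: the while loop currently retries forever with a fixed 3-second wait. One possible fix is to cap the number of retries and back off exponentially. This is only a rough sketch, not part of the repo, and the fetch_chapter helper name is made up here; it reuses the headers, requests, and time objects already defined in main.py.

# Sketch: bounded retry with exponential backoff for the 503 case
def fetch_chapter(url, max_retries=5):
    for attempt in range(max_retries):
        response = requests.get(url, headers=headers)
        if '503 Service Temporarily Unavailable' not in response.text:
            response.encoding = 'utf-8'
            return response
        # Wait 3s, 6s, 12s, ... before the next attempt
        print('Got 503, retrying in {}s'.format(3 * 2 ** attempt))
        time.sleep(3 * 2 ** attempt)
    raise RuntimeError('Still getting 503 after {} retries: {}'.format(max_retries, url))

The while loop in main.py could then be replaced by a single response_2 = fetch_chapter(x) call.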

About this Post

This post is written by Andy, licensed under CC BY-NC 4.0.

#Python #Crawler