Playing with Web Scrapers
urllib
Basic usage
urllib ships with the Python standard library, so there is nothing to install.
- Test
import urllib.request
# Define the URL to visit
url = 'http://www.baidu.com'
# Send the request the way a browser would
response = urllib.request.urlopen(url)
# Check the status code to see whether the request succeeded
# print(response.getcode())  # 200
# print(response)  # <http.client.HTTPResponse object at 0x0000017DC4629A30>
# Get the page source; read() returns bytes, which need decoding
# content = response.read().decode('utf-8')
# Read line by line
content = response.readlines()
print(content)  # prints the page source

- However, HTTPS sites may still raise errors; adding a User-Agent header lets them be read normally.
import urllib.request
import urllib.parse
url_page = 'https://www.baidu.com/s?'
# Pretend to be a real browser
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
# URL-encode the Chinese query parameters
data = {
    'wd': '周杰伦',
    'gender': '男',
    'location': '中国台湾'
}
params = urllib.parse.urlencode(data)
# Build the Request object and attach the user-agent
request = urllib.request.Request(url=url_page + params, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)  # prints the page source
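For a single value, urllib.parse.quote performs the same percent-encoding that urlencode applies to a whole dict. A small sketch of the difference (the printed values are what the standard library produces for this input):

import urllib.parse

# quote() encodes one value; use it when splicing a single parameter into a URL yourself
print(urllib.parse.quote('周杰伦'))              # %E5%91%A8%E6%9D%B0%E4%BC%A6
# urlencode() takes a dict and builds the full key=value&key=value query string
print(urllib.parse.urlencode({'wd': '周杰伦'}))  # wd=%E5%91%A8%E6%9D%B0%E4%BC%A6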
- Saving images, audio, and video files

import urllib.request
# Define the URL of the resource to fetch
# url_page = 'http://www.baidu.com'
url_img = 'https://img1.baidu.com/it/u=2835220188,4227150300&fm=253&fmt=auto&app=138&f=JPEG?w=500&h=585'
# url_video = 'http://www.baidu.com'
# Download the resource
# urllib.request.urlretrieve(url_img, './raw/baidu.html')
# For an image, the second argument is the local file path
urllib.request.urlretrieve(url_img, './raw/lisa.jpg')
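Note that urlretrieve has no headers parameter, so it can fail on sites that insist on a browser User-Agent. A minimal sketch of a workaround (the save path here is only an example): build a Request with headers, read the bytes, and write them to disk yourself.

import urllib.request

img_url = 'https://img1.baidu.com/it/u=2835220188,4227150300&fm=253&fmt=auto&app=138&f=JPEG?w=500&h=585'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'}
request = urllib.request.Request(img_url, headers=headers)
# Read the raw bytes and save them; no decoding for binary files
with urllib.request.urlopen(request) as response, open('./raw/lisa_ua.jpg', 'wb') as f:
    f.write(response.read())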
- IP proxy pool

import urllib.request
import random
base_url = 'http://baidu.com/s?wd=ip'
# Build the request
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Cookie': '__qc_wId=865; JSESSIONID=D06865AF09CD9D619CA1930C9EC055A6'
}
request = urllib.request.Request(url=base_url, headers=header)
# Proxy IP pool
proxies_pool = [
    {'http': '120.24.76.81:8123'},
    {'http': '120.24.76.81:8124'}
]
# Pick a random proxy from the pool
proxies = random.choice(proxies_pool)
# handler / build_opener / open: using these three instead of urlopen gives more control
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
print(content)
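If every later urlopen call should go through the same proxy, urllib lets you install the opener globally. A small sketch reusing the opener and request built above:

# Install the opener globally so plain urlopen() also routes through the proxy handler
urllib.request.install_opener(opener)
response = urllib.request.urlopen(request)  # now uses the proxy without calling opener.open()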
Comprehensive exercise (Baidu Translate)

import urllib.request
import urllib.parse
import json
base_url = 'https://fanyi.baidu.com/sug'  # simple translation
# detailed translation
# base_url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
}
while True:
    english = input('Enter the word to translate: ')
    data = {
        'kw': english
    }
    data = urllib.parse.urlencode(data).encode('utf-8')
    # data is the form payload of the POST request and must be encoded to bytes;
    # for a GET request the parameters are appended to the URL and no .encode() is needed
    request = urllib.request.Request(base_url, data, headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    # print(content)
    result = json.loads(content)
    # print(result)
    if len(result['data']) > 0:
        print(result['data'][0]['v'])
    else:
        print("I don't know")
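The same urlencode call serves both request styles; the difference is only where the encoded string goes. A minimal sketch of the two cases (https://example.com/search is a placeholder endpoint):

import urllib.parse
import urllib.request

params = urllib.parse.urlencode({'kw': 'spider'})
headers = {'User-Agent': 'Mozilla/5.0'}

# GET: append the query string to the URL; no bytes conversion
get_request = urllib.request.Request('https://example.com/search?' + params, headers=headers)

# POST: pass the query string as data, encoded to bytes (supplying data makes the request a POST)
post_request = urllib.request.Request('https://example.com/search',
                                      data=params.encode('utf-8'), headers=headers)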
XPath

Fetching the page source often gives you more than you need; when you only want a certain kind of node (specific div or img elements, say), install lxml and use XPath to filter.

pip install lxml
from lxml import etree
import urllib.request
import random
# XPath parsing: etree.parse for local files, etree.HTML for pages fetched from a server
base_url = 'https://codert.cn/'
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=base_url, headers=header)
handler = urllib.request.HTTPHandler()
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
# print(content)
# Basic XPath syntax
'''
1. Path queries
   //  : select all descendant nodes, regardless of depth
   /   : select direct children only
2. Predicate queries
   //div[@id]
   //div[@id="management"]
3. Attribute queries
   //@class
4. Fuzzy queries
   //div[contains(@id, "he")]
   //div[starts-with(@id, "he")]
5. Content queries
   //div/h1/text()
6. Logical operators
   //div[@id='head' and @class='s_down']
   //title | //price
'''
tree = etree.HTML(content)
result_list = tree.xpath('//img/@data-lazy-src')  # grab the site's lazy-loaded image URLs
# print(len(result_list))
for temp in result_list:
    try:
        print(temp)
        urllib.request.urlretrieve(temp, './raw/douban/condert' + str(random.randint(1, 10000)) + '.jpg')
    except Exception:
        print('failed to download ' + temp)
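The comment at the top distinguishes etree.parse (local files) from etree.HTML (strings fetched from a server). A tiny self-contained sketch of the same syntax rules, using a made-up inline HTML snippet instead of a real site:

from lxml import etree

html = '<div id="head"><h1>hello</h1><img data-lazy-src="/a.jpg"/></div>'
tree = etree.HTML(html)
print(tree.xpath('//div[@id="head"]/h1/text()'))  # ['hello']
print(tree.xpath('//img/@data-lazy-src'))         # ['/a.jpg']

# For a file on disk you would parse it instead, e.g.:
# tree = etree.parse('local.html', etree.HTMLParser())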
requests

Install it with pip install requests
import requests
from lxml import etree
'''
Useful attributes on a requests response:
r.text        : the page source as a decoded string
r.encoding    : get or set the encoding used to decode it
r.url         : the URL of the response
r.content     : the response body as bytes
r.status_code : the HTTP status code
r.headers     : the response headers
'''
url = 'https://wish.zhangweishihundan.com/'
response = requests.get(url)
response.encoding = 'utf-8'
content = response.text
tree = etree.HTML(content)
result_list = tree.xpath('//div[@class="sbody"]/text()')
# Save the results to a file
with open('./raw/zhangwei/wish.txt', 'w', encoding='utf-8') as fp:
    for temp in result_list:
        fp.write(temp + '\n')

- Simulating a GET request with requests
import requests
url = 'https://www.baidu.com/s'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
data = {
    'wd': 'lisa'
}
# Parameter values do not need to be encoded by hand
response = requests.get(url=url, headers=header, params=data)
response.encoding = 'utf-8'
content = response.text
print(content)  # prints the page source

- Simulating a POST request with requests
import requests
import json
url = 'https://fanyi.baidu.com/sug'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
data = {
    'kw': 'love'
}
# Unlike get(), which takes params, post() takes its form payload as data
response = requests.post(url=url, data=data, headers=header)
response.encoding = 'utf-8'
content = response.text
content = json.loads(content)
print(content['data'])  # prints the translation of "love"
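Since the endpoint returns JSON, requests can also decode it directly, without importing json. A brief sketch of that shortcut, reusing the same url, data, and header as above:

response = requests.post(url=url, data=data, headers=header)
result = response.json()  # parses the JSON body into a dict
print(result['data'])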
scrapy

- Scrapy can be thought of as a framework for building crawlers.
In a terminal, run scrapy startproject <project name> to create a new project (the generated layout is sketched after this list).
- Create a spider file in the spiders folder
In a terminal, cd into the project's spiders folder and run scrapy genspider <file name> <website>
import scrapy

class BaiduSpider(scrapy.Spider):
    # The spider's name; this is the value used when running the crawler
    name = 'baidu'
    allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.baidu.com/']

    # Called after the start_urls are fetched; response is the returned object,
    # roughly what urllib.request.urlopen() would give you
    def parse(self, response):
        pass
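The generated parse() is empty; it is where the extraction logic goes. A minimal sketch of what it might look like inside BaiduSpider, using the XPath ideas from earlier (the selector here is only illustrative; .get() needs a reasonably recent Scrapy):

    def parse(self, response):
        # response supports XPath directly, no lxml needed
        title = response.xpath('//title/text()').get()
        # Anything yielded as a dict becomes a scraped item
        yield {'title': title}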
- Run the spider
In a terminal, cd into the project's spiders folder and run scrapy crawl <spider name>
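For reference, the tree that scrapy startproject generates looks roughly like this (exact files vary slightly by Scrapy version; "myproject" is a placeholder name):

myproject/
    scrapy.cfg            # deployment configuration
    myproject/
        __init__.py
        items.py          # item definitions
        middlewares.py    # spider / downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # spider files created by scrapy genspider live here
            __init__.py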
Example

Download League of Legends champion skin images
74"""
Date: 2022.11.7
Author: panther
Language: python3
"""
import requests
import re
import json
import os
def getHtml(url):
try:
r = requests.get(url)
r.raise_for_status()
r.encoding = r.apparent_encoding
except:
print(url + "爬取失败!")
else:
response = r.text
getInfo(response)
def getInfo(res):
lists = re.findall(r'"keys":(.*?),"data"', res)
# print(lists)
hero_id = json.loads(lists[0])
# print(hero_id)
for hero in hero_id.values():
getSkin(hero)
def getSkin(hero):
url = 'https://lol.qq.com/biz/hero/' + hero + '.js'
try:
r = requests.get(url)
r.raise_for_status()
r.encoding = r.apparent_encoding
except:
print(url + "爬取失败!")
else:
html = r.text
num = re.findall(r'"id":"(\d{4,6})","num"', html)
for i in range(len(num)):
img_url = 'https://game.gtimg.cn/images/lol/act/img/skin/big' + num[i] + '.jpg'
save_img(hero, img_url)
def save_img(hero, img_url):
root = hero + "\\"
path = root + img_url.split('/')[-1]
try:
if not os.path.exists(root):
os.mkdir(root)
if not os.path.exists(path):
r = requests.get(img_url)
with open(path, 'wb') as f:
f.write(r.content)
f.close()
print("文件保存成功!")
else:
print("文件已存在!")
except:
print("爬取失败!")
print(img_url + "已下载")
def main():
url = "https://lol.qq.com/biz/hero/champion.js"
getHtml(url)
if __name__ == "__main__":
main()