Getting Started with Python Web Scraping

1. Requests

1. Installation

pip install requests

2. Sending Requests

import requests

url = 'https://www.baidu.com'
# GET method
response = requests.get(url)

# POST method
response = requests.post(url)

# DELETE method
response = requests.delete(url)

# PATCH method
response = requests.patch(url)

# PUT method
response = requests.put(url)

3. Passing URL Parameters

import requests

url = 'https://www.baidu.com'
params = {'key1': 'value1', 'key2': 'value2'}
response = requests.get(url, params=params)
print(response.url)
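requests URL-encodes the params dictionary and appends it to the URL as a query string, so the printed URL ends with ?key1=value1&key2=value2.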

4. Response Content

import requests

url = 'https://www.baidu.com'
response = requests.get(url)

# Response body as text (decoded with the guessed encoding)
print(response.text)

# Change the encoding used to decode response.text
response.encoding = 'utf-8'
print(response.encoding)

# Response body as raw bytes
print(response.content)

# Response body parsed as JSON
print(response.json())

# Status code
print(response.status_code)

# Headers
print(response.headers)
print(response.headers['Content-Type'])
print(response.headers.get('Content-Type'))

# Cookies
print(response.cookies)
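Note that response.json() raises a ValueError when the body is not valid JSON (the Baidu homepage returns HTML, for example), so it is worth checking the Content-Type header or wrapping the call in a try/except.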

5. Custom Request Headers and Cookies

import requests

url = 'https://www.baidu.com'
headers = {
    'User-Agent': 'my-app/0.0.1',
    'Cookie': 'cookies'
}
cookies = {
    'cookies_are': 'working'
}
response = requests.get(url, headers=headers, cookies=cookies)
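To verify what is actually sent, point the same request at httpbin (a request-testing service covered at the end of this article); its /headers endpoint echoes the request headers back as JSON:

import requests

headers = {'User-Agent': 'my-app/0.0.1'}
# httpbin.org/headers echoes the request headers back as JSON
response = requests.get('https://httpbin.org/headers', headers=headers)
print(response.json())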

6. Redirects and Timeouts

import requests

url = 'https://www.baidu.com'
# allow_redirects controls redirection; when False, redirects are not followed automatically
response = requests.get(url, allow_redirects=False)
print(response.url)

# timeout sets a time limit in seconds; an exception is raised when it is exceeded
try:
    response = requests.get(url, timeout=0.01)
    print(response.url)
except Exception as error:
    # print(error)
    print('Request timed out')
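With allow_redirects=False the redirect response itself is returned, so its status code and Location header can be inspected. A small sketch against httpbin, whose /redirect/1 endpoint issues a single redirect:

import requests

# httpbin.org/redirect/1 responds with a 302 pointing at /get
response = requests.get('https://httpbin.org/redirect/1', allow_redirects=False)
print(response.status_code)              # 302
print(response.headers.get('Location'))  # /get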

7. POST Requests with Data

import requests

url = 'https://www.baidu.com'
data = {
    'key1': 'value1',
    'key2': 'value2'
}
response = requests.post(url, data=data)
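The data parameter sends the dictionary form-encoded. To send a JSON body instead, pass it via the json parameter; a sketch using httpbin's /post endpoint, which echoes the request back:

import requests

data = {'key1': 'value1', 'key2': 'value2'}
# json= serializes the dict and sets Content-Type: application/json
response = requests.post('https://httpbin.org/post', json=data)
print(response.json()['json'])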

8. Downloading Files

# Download the Baidu logo image
import requests

response = requests.get('https://www.baidu.com/img/bd_logo1.png')
filename = 'bd_logo1.png'
try:
    # Write the response body in binary mode
    file = open('./Others/' + filename, 'wb')
    file.write(response.content)
    file.close()
except Exception as error:
    print(str(error))
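For large files it is better not to hold the whole body in memory; requests supports streaming downloads with stream=True and iter_content(). A sketch downloading the same logo in chunks:

import requests

url = 'https://www.baidu.com/img/bd_logo1.png'
# stream=True defers downloading the body until it is iterated
with requests.get(url, stream=True) as response:
    with open('bd_logo1.png', 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)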

2. BeautifulSoup

Reference documentation

1. Installation

1. Install the library

# install with pip
pip install beautifulsoup4
# install with apt-get
sudo apt-get install python-bs4

2. Install a parser

# install with pip
pip install lxml
pip install html5lib
# install with apt-get
sudo apt-get install python-lxml
sudo apt-get install python-html5lib
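Python also ships with a built-in parser, html.parser, which requires no extra installation but is generally slower and less lenient than lxml and html5lib.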


2. Creating a BeautifulSoup Object

from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')
# encode() returns the document as UTF-8 bytes (it does not modify the soup)
soup.encode('utf-8')

# Pretty-print the HTML
print(soup.prettify())

3. Finding Elements in a Page

from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')
# Find elements by tag name
print(soup.find_all(name='title'))

# Find elements by id
print(soup.find_all(id='link2'))

# Find elements by CSS class
print(soup.find_all(class_='title'))

# Find elements by attribute
print(soup.find_all(attrs={'class': 'sister'}))

# Find elements with a CSS selector
print(soup.select('title'))

# Find elements that have a given attribute
print(soup.select('a[href]'))

# Find elements by attribute value (exact, prefix, suffix, substring)
print(soup.select('a[href="http://example.com/elsie"]'))
print(soup.select('a[href^="http://example.com/"]'))
print(soup.select('a[href$="tillie"]'))
print(soup.select('a[href*=".com/el"]'))

# Find elements by a custom attribute
print(soup.find_all(attrs={"attr": "value"}))
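Since find_all() returns a list of tags, the results can be iterated directly; a common pattern is pulling every link out of the page:

# Print the href and text of every <a> tag
for link in soup.find_all('a'):
    print(link.get('href'), link.get_text())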

4. Getting Element Content

from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')
# Get a specific element
tag = soup.find(id='link1')

# Print the whole tag
print(tag)

# Get the tag's name
print(tag.name)

# Get the value of a given attribute
print(tag['href'])

# Get all text content inside the tag
print(tag.get_text())
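All of a tag's attributes are also available at once as a dictionary:

# All attributes of the tag as a dict, e.g. {'href': '...', 'class': ['sister'], 'id': 'link1'}
print(tag.attrs)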

3. Selenium

Selenium is mainly used for automated testing and supports multiple browsers. In web scraping it is mainly used to handle pages rendered with JavaScript.

1. Installation

# install with pip
pip install selenium
# install with apt-get
sudo apt-get install python-selenium

2. Driving a Browser

1. Firefox

Download the driver

geckodriver

Extract the downloaded archive to get geckodriver and place it in the Python installation directory (any directory on your PATH works).

Alternatively, extract the archive, copy geckodriver to /usr/local/bin, and make it executable:

sudo cp geckodriver /usr/local/bin
sudo chmod 777 /usr/local/bin/geckodriver
import time
from selenium import webdriver

url = 'http://www.baidu.com'
# Configure the browser to run headless (no visible window)
options = webdriver.FirefoxOptions()
options.add_argument('--headless')
# Launch the browser
# browser = webdriver.Firefox()
browser = webdriver.Firefox(options=options)
# Open the page
browser.get(url)
# Sleep for 2 seconds
time.sleep(2)
# Quit the browser
browser.quit()

2. Chrome

Driver download address


Check the installed Chrome version, then download the driver version that matches it from the download address.

Extract the downloaded archive to get chromedriver and place it in the Python installation directory (any directory on your PATH works).

Alternatively, extract the archive, copy chromedriver to /usr/local/bin, and make it executable:

sudo cp chromedriver /usr/local/bin
sudo chmod 777 /usr/local/bin/chromedriver
# or install directly with apt-get
sudo apt-get install chromium-chromedriver

import time
from selenium import webdriver

url = 'http://www.baidu.com'
# Configure the browser to run headless (no visible window)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
# Launch the browser
browser = webdriver.Chrome(options=options)
# Open the page
browser.get(url)
# Sleep for 2 seconds
time.sleep(2)
# Quit the browser
browser.quit()

3. Common webdriver Operations

from selenium import webdriver

url = 'http://www.baidu.com'
browser = webdriver.Firefox()
# Open the given url in the current browser session
browser.get(url)

# Refresh the current page
browser.refresh()

# Execute a JavaScript statement
browser.execute_script('js')

# Set an implicit wait of up to 10 seconds for element lookups
browser.implicitly_wait(10)

# URL of the current page
browser.current_url

# Title of the current page
browser.title

# Rendered source of the current page
browser.page_source

# Handles of all windows in the current session
browser.window_handles

# Close the current browser window
browser.close()
# Quit the webdriver and close all windows
browser.quit()
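Because page_source returns the rendered HTML, it can be handed straight to BeautifulSoup and parsed with the techniques above. A minimal sketch combining the two libraries:

import time
from selenium import webdriver
from bs4 import BeautifulSoup

browser = webdriver.Firefox()
browser.get('http://www.baidu.com')
# Give the page a moment to finish rendering JavaScript
time.sleep(2)
# Parse the rendered source with BeautifulSoup
soup = BeautifulSoup(browser.page_source, 'lxml')
print(soup.title)
browser.quit()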

4. Locating Page Elements

# Find a single element
element = browser.find_element_by_id('id')
element = browser.find_element_by_name('name')
element = browser.find_element_by_xpath('xpath')
element = browser.find_element_by_link_text('link text')
element = browser.find_element_by_tag_name('tag name')
element = browser.find_element_by_class_name('class name')
element = browser.find_element_by_css_selector('css selector')
element = browser.find_element_by_partial_link_text('partial link text')
# Find multiple elements
elements = browser.find_elements_by_id('id')
elements = browser.find_elements_by_name('name')
elements = browser.find_elements_by_xpath('xpath')
elements = browser.find_elements_by_link_text('link text')
elements = browser.find_elements_by_tag_name('tag name')
elements = browser.find_elements_by_class_name('class name')
elements = browser.find_elements_by_css_selector('css selector')
elements = browser.find_elements_by_partial_link_text('partial link text')
Method                                Purpose
find_element_by_id()                  find by id
find_element_by_name()                find by the name attribute
find_element_by_xpath()               find by XPath
find_element_by_link_text()           find by link text
find_element_by_tag_name()            find by tag name
find_element_by_class_name()          find by the class attribute
find_element_by_css_selector()        find by CSS selector
find_element_by_partial_link_text()   find by partial link text match
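Note that Selenium 4 removed these find_element_by_* helpers in favor of a By-based API; on a newer release the equivalent calls look like this:

from selenium.webdriver.common.by import By

element = browser.find_element(By.ID, 'id')
elements = browser.find_elements(By.CSS_SELECTOR, 'css selector')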

5. Common WebElement Methods and Properties

# Common methods
# Clear the content of the element
element.clear()
# Type data into the element
element.send_keys('value')
# Click the element
element.click()
# Submit the form
element.submit()
# Return the value of the exactly matching attribute name; falls back to the matching property if the attribute is absent
element.get_attribute('name')
# Save a screenshot of the element as a PNG file
element.screenshot('filename')

# Common properties
# Text content of the element
element.text
# Tag name of the element
element.tag_name
# Internal id of the element
element.id
# Location of the element on the page
element.location
# Size of the element
element.size
# Dictionary containing the element's size and location
element.rect
# Parent of the element
element.parent
# Screenshot of the element as binary PNG data
element.screenshot_as_png
# Screenshot of the element as a base64-encoded string
element.screenshot_as_base64
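Putting these together, a typical interaction is filling in a search box and submitting the form. A minimal sketch against the Baidu homepage, assuming its search input and button still use the ids kw and su (these may change over time):

from selenium import webdriver

browser = webdriver.Firefox()
browser.get('http://www.baidu.com')
# 'kw' and 'su' are assumed to be the ids of Baidu's search box and button
search_box = browser.find_element_by_id('kw')
search_box.clear()
search_box.send_keys('python')
browser.find_element_by_id('su').click()
browser.quit()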

-1. Practice Sites for Scraping

1. HTTP Request Practice

httpbin

The httpbin server is hosted overseas and can be very slow, so you can run it locally with Docker instead.

sudo docker run -p 80:80 kennethreitz/httpbin
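The local instance then answers on port 80, so the earlier examples work unchanged when pointed at localhost:

import requests

# The dockerized httpbin listens on localhost:80
response = requests.get('http://localhost/get', params={'key1': 'value1'})
print(response.json())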

0. References

Beautiful Soup 4.2.0 documentation

Selenium with Python documentation (Chinese translation)