Getting Started with Python Web Scraping

1. Requests

1. Installation

pip install requests

2. Sending Requests

import requests

url = 'https://www.baidu.com'
# GET method
response = requests.get(url)

# POST method
response = requests.post(url)

# DELETE method
response = requests.delete(url)

# PATCH method
response = requests.patch(url)

# PUT method
response = requests.put(url)

3. Passing URL Parameters

import requests

url = 'https://www.baidu.com'
params = {'key1': 'value1', 'key2': 'value2'}
response = requests.get(url, params=params)
print(response.url)
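requests URL-encodes the params dictionary and appends it to the URL as a query string, so the printed URL ends with ?key1=value1&key2=value2.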

4. Response Content

import requests

url = 'https://www.baidu.com'
response = requests.get(url)

# Response body as text (decoded with the guessed encoding)
print(response.text)

# Change the encoding used to decode response.text
response.encoding = 'utf-8'
print(response.encoding)

# Response body as raw bytes
print(response.content)

# Response body parsed as JSON
print(response.json())

# Status code
print(response.status_code)

# Headers
print(response.headers)
print(response.headers['Content-Type'])
print(response.headers.get('Content-Type'))

# Cookies
print(response.cookies)
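Note that response.json() raises a ValueError when the body is not valid JSON (the Baidu homepage returns HTML, for example), so it is worth checking the Content-Type header or wrapping the call in a try/except.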

5. Custom Request Headers and Cookies

import requests

url = 'https://www.baidu.com'
headers = {
    'User-Agent': 'my-app/0.0.1',
    'Cookie': 'cookies'
}
cookies = {
    'cookies_are': 'working'
}
response = requests.get(url, headers=headers, cookies=cookies)
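To verify what is actually sent, point the same request at httpbin (a request-testing service covered at the end of this article); its /headers endpoint echoes the request headers back as JSON:

import requests

headers = {'User-Agent': 'my-app/0.0.1'}
# httpbin.org/headers echoes the request headers back as JSON
response = requests.get('https://httpbin.org/headers', headers=headers)
print(response.json())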

6. Redirects and Timeouts

import requests

url = 'https://www.baidu.com'
# allow_redirects controls redirection; when False, redirects are not followed automatically
response = requests.get(url, allow_redirects=False)
print(response.url)

# timeout sets a time limit in seconds; an exception is raised when it is exceeded
try:
    response = requests.get(url, timeout=0.01)
    print(response.url)
except Exception as error:
    # print(error)
    print('Request timed out')
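With allow_redirects=False the redirect response itself is returned, so its status code and Location header can be inspected. A small sketch against httpbin, whose /redirect/1 endpoint issues a single redirect:

import requests

# httpbin.org/redirect/1 responds with a 302 pointing at /get
response = requests.get('https://httpbin.org/redirect/1', allow_redirects=False)
print(response.status_code)              # 302
print(response.headers.get('Location'))  # /get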

7. POST Requests with Data

import requests

url = 'https://www.baidu.com'
data = {
    'key1': 'value1',
    'key2': 'value2'
}
response = requests.post(url, data=data)
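The data parameter sends the dictionary form-encoded. To send a JSON body instead, pass it via the json parameter; a sketch using httpbin's /post endpoint, which echoes the request back:

import requests

data = {'key1': 'value1', 'key2': 'value2'}
# json= serializes the dict and sets Content-Type: application/json
response = requests.post('https://httpbin.org/post', json=data)
print(response.json()['json'])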

8. Downloading Files

# Download the Baidu logo image
import requests

response = requests.get('https://www.baidu.com/img/bd_logo1.png')
filename = 'bd_logo1.png'
try:
    # Write the response body in binary mode
    file = open('./Others/' + filename, 'wb')
    file.write(response.content)
    file.close()
except Exception as error:
    print(str(error))
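For large files it is better not to hold the whole body in memory; requests supports streaming downloads with stream=True and iter_content(). A sketch downloading the same logo in chunks:

import requests

url = 'https://www.baidu.com/img/bd_logo1.png'
# stream=True defers downloading the body until it is iterated
with requests.get(url, stream=True) as response:
    with open('bd_logo1.png', 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)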

2. BeautifulSoup

Reference documentation

1. Installation

1. Install the library

# install with pip
pip install beautifulsoup4
# install with apt-get
sudo apt-get install python-bs4

2. Install a parser

# install with pip
pip install lxml
pip install html5lib
# install with apt-get
sudo apt-get install python-lxml
sudo apt-get install python-html5lib
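Python also ships with a built-in parser, html.parser, which requires no extra installation but is generally slower and less lenient than lxml and html5lib.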


2. Creating a BeautifulSoup Object

from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')
# encode() returns the document as UTF-8 bytes (it does not modify the soup)
soup.encode('utf-8')

# Pretty-print the HTML
print(soup.prettify())

3. Finding Elements in a Page

from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')
# Find elements by tag name
print(soup.find_all(name='title'))

# Find elements by id
print(soup.find_all(id='link2'))

# Find elements by CSS class
print(soup.find_all(class_='title'))

# Find elements by attribute
print(soup.find_all(attrs={'class': 'sister'}))

# Find elements with a CSS selector
print(soup.select('title'))

# Find elements that have a given attribute
print(soup.select('a[href]'))

# Find elements by attribute value (exact, prefix, suffix, substring)
print(soup.select('a[href="http://example.com/elsie"]'))
print(soup.select('a[href^="http://example.com/"]'))
print(soup.select('a[href$="tillie"]'))
print(soup.select('a[href*=".com/el"]'))

# Find elements by a custom attribute
print(soup.find_all(attrs={"attr": "value"}))
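Since find_all() returns a list of tags, the results can be iterated directly; a common pattern is pulling every link out of the page:

# Print the href and text of every <a> tag
for link in soup.find_all('a'):
    print(link.get('href'), link.get_text())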

4. Getting Element Content

from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')
# Get a specific element
tag = soup.find(id='link1')

# Print the whole tag
print(tag)

# Get the tag's name
print(tag.name)

# Get the value of a given attribute
print(tag['href'])

# Get all text content inside the tag
print(tag.get_text())
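All of a tag's attributes are also available at once as a dictionary:

# All attributes of the tag as a dict, e.g. {'href': '...', 'class': ['sister'], 'id': 'link1'}
print(tag.attrs)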

3. Selenium

Selenium is mainly used for automated testing and supports multiple browsers. In web scraping it is mainly used to handle pages rendered with JavaScript.

1. Installation

# install with pip
pip install selenium
# install with apt-get
sudo apt-get install python-selenium

2. Driving a Browser

1. Firefox

Download the driver

geckodriver

Extract the downloaded archive to get geckodriver and place it in the Python installation directory (any directory on your PATH works).

Alternatively, extract the archive, copy geckodriver to /usr/local/bin, and make it executable:

sudo cp geckodriver /usr/local/bin
sudo chmod 777 /usr/local/bin/geckodriver
import time
from selenium import webdriver

url = 'http://www.baidu.com'
# Configure the browser to run headless (no visible window)
options = webdriver.FirefoxOptions()
options.add_argument('--headless')
# Launch the browser
# browser = webdriver.Firefox()
browser = webdriver.Firefox(options=options)
# Open the page
browser.get(url)
# Sleep for 2 seconds
time.sleep(2)
# Quit the browser
browser.quit()

2. Chrome

Driver download address


Check the installed Chrome version, then download the driver version that matches it from the download address.

Extract the downloaded archive to get chromedriver and place it in the Python installation directory (any directory on your PATH works).

Alternatively, extract the archive, copy chromedriver to /usr/local/bin, and make it executable:

sudo cp chromedriver /usr/local/bin
sudo chmod 777 /usr/local/bin/chromedriver
# or install directly with apt-get
sudo apt-get install chromium-chromedriver

import time
from selenium import webdriver

url = 'http://www.baidu.com'
# Configure the browser to run headless (no visible window)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
# Launch the browser
browser = webdriver.Chrome(options=options)
# Open the page
browser.get(url)
# Sleep for 2 seconds
time.sleep(2)
# Quit the browser
browser.quit()

3. Common webdriver Operations

from selenium import webdriver

url = 'http://www.baidu.com'
browser = webdriver.Firefox()
# Open the given url in the current browser session
browser.get(url)

# Refresh the current page
browser.refresh()

# Execute a JavaScript statement
browser.execute_script('js')

# Set an implicit wait of up to 10 seconds for element lookups
browser.implicitly_wait(10)

# URL of the current page
browser.current_url

# Title of the current page
browser.title

# Rendered source of the current page
browser.page_source

# Handles of all windows in the current session
browser.window_handles

# Close the current browser window
browser.close()
# Quit the webdriver and close all windows
browser.quit()
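Because page_source returns the rendered HTML, it can be handed straight to BeautifulSoup and parsed with the techniques above. A minimal sketch combining the two libraries:

import time
from selenium import webdriver
from bs4 import BeautifulSoup

browser = webdriver.Firefox()
browser.get('http://www.baidu.com')
# Give the page a moment to finish rendering JavaScript
time.sleep(2)
# Parse the rendered source with BeautifulSoup
soup = BeautifulSoup(browser.page_source, 'lxml')
print(soup.title)
browser.quit()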

4. Locating Page Elements

# Find a single element
element = browser.find_element_by_id('id')
element = browser.find_element_by_name('name')
element = browser.find_element_by_xpath('xpath')
element = browser.find_element_by_link_text('link text')
element = browser.find_element_by_tag_name('tag name')
element = browser.find_element_by_class_name('class name')
element = browser.find_element_by_css_selector('css selector')
element = browser.find_element_by_partial_link_text('partial link text')
# Find multiple elements
elements = browser.find_elements_by_id('id')
elements = browser.find_elements_by_name('name')
elements = browser.find_elements_by_xpath('xpath')
elements = browser.find_elements_by_link_text('link text')
elements = browser.find_elements_by_tag_name('tag name')
elements = browser.find_elements_by_class_name('class name')
elements = browser.find_elements_by_css_selector('css selector')
elements = browser.find_elements_by_partial_link_text('partial link text')
Method                                Purpose
find_element_by_id()                  find by id
find_element_by_name()                find by the name attribute
find_element_by_xpath()               find by XPath
find_element_by_link_text()           find by link text
find_element_by_tag_name()            find by tag name
find_element_by_class_name()          find by the class attribute
find_element_by_css_selector()        find by CSS selector
find_element_by_partial_link_text()   find by partial link text match
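Note that Selenium 4 removed these find_element_by_* helpers in favor of a By-based API; on a newer release the equivalent calls look like this:

from selenium.webdriver.common.by import By

element = browser.find_element(By.ID, 'id')
elements = browser.find_elements(By.CSS_SELECTOR, 'css selector')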

5. Common WebElement Methods and Properties

# Common methods
# Clear the content of the element
element.clear()
# Type data into the element
element.send_keys('value')
# Click the element
element.click()
# Submit the form
element.submit()
# Return the value of the exactly matching attribute name; falls back to the matching property if the attribute is absent
element.get_attribute('name')
# Save a screenshot of the element as a PNG file
element.screenshot('filename')

# Common properties
# Text content of the element
element.text
# Tag name of the element
element.tag_name
# Internal id of the element
element.id
# Location of the element on the page
element.location
# Size of the element
element.size
# Dictionary containing the element's size and location
element.rect
# Parent of the element
element.parent
# Screenshot of the element as binary PNG data
element.screenshot_as_png
# Screenshot of the element as a base64-encoded string
element.screenshot_as_base64
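Putting these together, a typical interaction is filling in a search box and submitting the form. A minimal sketch against the Baidu homepage, assuming its search input and button still use the ids kw and su (these may change over time):

from selenium import webdriver

browser = webdriver.Firefox()
browser.get('http://www.baidu.com')
# 'kw' and 'su' are assumed to be the ids of Baidu's search box and button
search_box = browser.find_element_by_id('kw')
search_box.clear()
search_box.send_keys('python')
browser.find_element_by_id('su').click()
browser.quit()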

-1. Practice Sites for Scraping

1. HTTP Request Practice

httpbin

The httpbin server is hosted overseas and can be very slow, so you can run it locally with Docker instead.

sudo docker run -p 80:80 kennethreitz/httpbin
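The local instance then answers on port 80, so the earlier examples work unchanged when pointed at localhost:

import requests

# The dockerized httpbin listens on localhost:80
response = requests.get('http://localhost/get', params={'key1': 'value1'})
print(response.json())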

0. References

Beautiful Soup 4.2.0 documentation

Selenium with Python documentation (Chinese translation)