閱讀目錄

requests庫使用流程
分析案例
- 需求：爬取搜狗首頁的頁面數據
- 需求：爬取搜狗指定詞條對應的搜索結果頁面
- 需求：爬取豆瓣電影分類排行榜中的電影詳情數據
- 爬取國家藥品監督管理總局數據
- 爬取set-cookies數據

回到頂部

requests庫使用流程

使用流程/編碼流程

1.指定url

2.基于requests模塊發起請求

3.獲取響應對象中的數據值

4.持久化存儲

回到頂部

分析案例

需求：爬取搜狗首頁的頁面數據

# Fetch the Sogou home page and save its HTML source to ./sogou.html.

import requests

if __name__ == '__main__':
    # Step 1: the URL to fetch.
    sogou_url = 'https://www.sogou.com/'
    # Step 2: send a GET request; requests.get returns a response object.
    # An explicit timeout is passed because requests does not time out by
    # default, so a stalled server would otherwise hang the script.
    response = requests.get(url=sogou_url, timeout=10)
    # Stop on 4xx/5xx instead of saving an error page as if it were the result.
    response.raise_for_status()
    # Step 3: the text attribute holds the response body decoded to a string
    # (the page source).
    page_text = response.text
    # Step 4: persist the page source to disk.
    with open('./sogou.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print('爬取數據完畢！！！')

需求：爬取搜狗指定詞條對應的搜索結果頁面

# Fetch the Sogou search-result page for a user-supplied keyword
# (a minimal web page collector).

# Anti-scraping background
#
# User-Agent: identifies the carrier of a request. A request sent from a
#             browser identifies itself as that browser; a request sent from
#             a crawler program identifies itself as the crawler.
#
# UA detection: a site inspects the identity of the request carrier to decide
#          whether the request comes from a crawler, and rejects it if so.
#          Requests from ordinary users always come from some browser, so a
#          request whose identity is not browser-based is made to fail.
#          In this course UA detection is the second anti-scraping mechanism
#          encountered; the first is the robots protocol.
#
# UA spoofing: change the crawler's User-Agent header so that it passes
#          UA detection.
import requests
# Keyword to search for.
word = input('enter a word you want to search:')
# Custom request headers (UA spoofing): pass a dict containing User-Agent
# through the headers argument of the request function.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}
# Base URL. The URL seen in the browser looks like
# https://www.sogou.com/web?query=qq, i.e. it carries a query parameter.
url = 'https://www.sogou.com/web'
# Query parameters are wrapped in a dict and passed via params so that
# requests builds and encodes the query string.
param = {
    'query': word,
}
# Send the request. An explicit timeout is passed because requests does not
# time out by default.
response = requests.get(url=url, params=param, headers=headers, timeout=10)
# Stop on 4xx/5xx instead of saving an error page as the search result.
response.raise_for_status()
# Response body as text.
page_text = response.text
# Persist the page. The keyword comes straight from user input, so characters
# that are path separators or otherwise invalid in file names are replaced;
# a keyword such as "a/b" would otherwise make open() fail.
unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
fileName = word.translate(unsafe_chars) + '.html'
with open(fileName, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print('爬取數據完畢！！！', fileName)

需求：爬取豆瓣電影分類排行榜中的電影詳情數據

# Fetch the Douban movie category ranking chart - action movies.
import requests
if __name__ == "__main__":
    # URL of the AJAX GET request (found by inspecting network traffic).
    url = 'https://movie.douban.com/j/chart/top_list?'
    # Custom request headers; they must be supplied as a dict.
    headers = {
        # Override User-Agent; other request headers could be set here too.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    }
    # Query parameters of the GET request (copied from the captured request).
    param = {
        'type': '5',
        'interval_id': '100:90',
        'action': '',
        'start': '0',
        'limit': '20',
    }
    # Send the GET request. An explicit timeout is passed because requests
    # does not time out by default.
    response = requests.get(url=url, headers=headers, params=param, timeout=10)
    # Surface HTTP errors directly; otherwise an error page would only show
    # up as a JSON decoding failure on the next line.
    response.raise_for_status()
    # Decode the JSON body and print it.
    print(response.json())

爬取國家藥品監督管理總局數據

# Goal: scrape the cosmetics production licence data published by the
# National Medical Products Administration at http://125.35.6.84:81/xk/

import requests
import json
if __name__ == "__main__":
    # Seconds to wait for each request; requests does not time out by default.
    request_timeout = 10
    # URL of the AJAX POST request behind the listing page (found by
    # inspecting network traffic); the listing is loaded via AJAX.
    url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    # Custom request headers; they must be supplied as a dict.
    headers = {
        # Override User-Agent; other request headers could be set here too.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    }
    all_id_list = []
    # Listing pages 3 and 4 are fetched.
    for page in range(3, 5):
        data = {
            'on': 'true',
            'page': str(page),
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': ''
        }
        list_response = requests.post(
            url=url, data=data, headers=headers, timeout=request_timeout)
        # Surface HTTP errors directly rather than as a JSON decoding failure.
        list_response.raise_for_status()
        json_text = list_response.json()

        # Every detail page shares the same URL; only the id differs. The id
        # comes from the JSON returned by the listing request and is later
        # posted to the detail endpoint.
        # Other fields of each record (EPS_NAME, PRODUCT_SN, QF_MANAGER_NAME,
        # XC_DATE, XK_DATE) are also available from the detail request.
        for record in json_text['list']:
            all_id_list.append(record['ID'])
    print(len(all_id_list))
    # The detail endpoint is also an AJAX POST request.
    post_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    for record_id in all_id_list:
        post_data = {
            'id': record_id
        }
        response = requests.post(
            url=post_url, data=post_data, headers=headers,
            timeout=request_timeout)
        # Only JSON responses are parsed; anything else is skipped. The header
        # may be absent or differ in case/spacing, so it is read with a
        # default and matched on its media-type prefix.
        content_type = response.headers.get('Content-Type', '')
        if content_type.lower().startswith('application/json'):
            detail = response.json()
            print(f"{detail['epsName']},{detail['businessPerson']}")

爬取set-cookies數據

# Log in to a site, follow the redirect embedded in the login response and
# collect the cookies (userlogin, DomAuthSessId) for later requests.

import re
from urllib import parse

import requests

# Placeholder credentials; replace with real values.
username = 'XX'
userCode = 'XXX'
userPsw = 'XXXXXX'

# Seconds to wait for each request; requests does not time out by default.
request_timeout = 10

headers = {
    'Accept': 'image/jpeg, application/x-ms-application, image/gif, application/xaml+xml, image/pjpeg, application/x-ms-xbap, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN',
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; Tablet PC 2.0)'
}

# Build the userlogin cookie value: 'CN=<username>/O=sipf' with '=' percent-
# encoded and the username's unicode escapes rewritten from '\uXXXX' to
# '%uXXXX'. Example of the target format: CN%3D%u6768%u6D69/O%3Dsipf
userlogin = 'CN'+parse.quote('=')+str(username.encode("unicode_escape"),encoding='utf-8').replace('\\','%')+'/O'+parse.quote('=')+'sipf'

# Obtain the set-cookies values.
login_url = 'http://www.xxx.com/SSO/LoginAction.action'
post_data = {
    'userCode': userCode,
    'userPsw': userPsw,
    '__checkbox_cookieFlg': 'true',
    'Submit.x': '43',
    'Submit.y': '13'
}

login_response = requests.post(
    login_url, data=post_data, headers=headers, timeout=request_timeout)
# The login response embeds an IFRAME whose src (up to '&RedirectTo=') is the
# URL that hands out the session cookie.
re_cookier_url = r'<IFRAME border=0 name=fw_list.*?src=(.*?)&RedirectTo=.*?>'
pa_url = re.compile(re_cookier_url, re.S)  # re.S lets '.' match newlines too
cookier_matches = pa_url.findall(login_response.text)
if not cookier_matches:
    # Without this check a failed login only shows up as a bare IndexError.
    raise RuntimeError('fw_list IFRAME not found in the login response')
cookier_url = cookier_matches[0]
print(cookier_url)

session_url = cookier_url+'&RedirectTo=/xxx/portal.nsf/portalViewFw?openform'
session = requests.session()
session.get(url=session_url, headers=headers, timeout=request_timeout)
# Read the cookies the server set on the session.
html_set_cookies = requests.utils.dict_from_cookiejar(session.cookies)
print(html_set_cookies.get('DomAuthSessId'))
DomAuthSessId = html_set_cookies.get('DomAuthSessId')

# Cookie dict assembled from the two values gathered above.
cookier = {
    'userlogin': userlogin,
    'DomAuthSessId': DomAuthSessId
}

本站僅提供存儲服務，所有內容均由用戶發布，如發現有害或侵權內容，請點擊舉報。

欧美性猛交XXXX免费看蜜桃,成人网18免费韩国,亚洲国产成人精品区综合,欧美日韩一区二区三区高清不卡,亚洲综合一区二区精品久久

閱讀目錄

requests庫使用流程

分析案例

需求：爬取搜狗首頁的頁面數據

需求：爬取搜狗指定詞條對應的搜索結果頁面

需求：爬取豆瓣電影分類排行榜中的電影詳情數據

爬取國家藥品監督管理總局數據

爬取set-cookies數據