Choosing learning resources
Basics: I watched this one first. It covers the basic operations, uses IDLE, skips a few steps, and some of the material is a bit dated (requests, parsing with Beautiful Soup, re, Scrapy): Python数据分析与展示, 北京理工大学, 中国大学MOOC(慕课) (icourse163.org)
Advanced: honestly I'd suggest going straight to this one instead. It goes into more detail, uses PyCharm, and the teacher (from Northeast China) is full of energy. Watch p51–104 directly; beyond the basics it adds urllib, parsing with XPath and JSONPath, and Selenium: 尚硅谷Python爬虫教程小白零基础速通(含python基础+爬虫案例), 哔哩哔哩, bilibili
Fetching pages: urllib
```python
import urllib.request
import urllib.parse

# GET with a single Chinese keyword: quote() percent-encodes it
url = 'https://www.baidu.com/s?wd='
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
url = url + urllib.parse.quote('小野')
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))

# GET with several parameters: urlencode() encodes the whole dict
url = 'http://www.baidu.com/s?'
data = {
    'name': '小刚',
    'sex': '男',
}
data = urllib.parse.urlencode(data)
url = url + data
print(url)
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
```
```python
import urllib.request
import urllib.parse
import json

# POST to Baidu Translate's suggestion endpoint
url = 'https://fanyi.baidu.com/sug'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
keyword = input('请输入您要查询的单词')
data = {
    'kw': keyword
}
# POST data must be url-encoded and then encoded to bytes
data = urllib.parse.urlencode(data).encode('utf-8')
request = urllib.request.Request(url=url, headers=headers, data=data)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)

# The response is JSON; re-serialize it with ensure_ascii=False to display Chinese characters
obj = json.loads(content)
s = json.dumps(obj, ensure_ascii=False)
print(s)
```
AJAX GET requests (when the front end and back end are separated, you can fetch the JSON directly)
```python
import urllib.request
import urllib.parse

# Douban's top-list endpoint returns JSON, 20 items per page
def create_request(page):
    base_url = 'https://movie.douban.com/j/chart/top_list?type=20&interval_id=100%3A90&action=&'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    }
    data = {
        'start': (page - 1) * 20,
        'limit': 20
    }
    data = urllib.parse.urlencode(data)
    url = base_url + data
    request = urllib.request.Request(url=url, headers=headers)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def down_load(page, content):
    with open('douban_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)

if __name__ == '__main__':
    start_page = int(input('请输入起始页码'))
    end_page = int(input('请输入结束页码'))
    for page in range(start_page, end_page + 1):
        request = create_request(page)
        content = get_content(request)
        down_load(page, content)
```
Exceptions
The HTTPError class is a subclass of URLError.
Imports: urllib.error.HTTPError and urllib.error.URLError.
HTTP errors: the error pages shown when the browser cannot reach the server properly; they tell the visitor what went wrong with the page.
Requests sent with urllib may fail. To make the code more robust, wrap the call in try/except and catch the two exception classes, URLError and HTTPError.
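A minimal sketch of catching both exception types (the URL here is only illustrative); HTTPError is caught first since it is the subclass:

```python
import urllib.request
import urllib.error

url = 'https://www.baidu.com/some-missing-page'  # illustrative URL

try:
    response = urllib.request.urlopen(url)
    print(response.read().decode('utf-8'))
except urllib.error.HTTPError as e:
    # The server responded, but with an error status code (404, 500, ...)
    print('HTTPError:', e.code)
except urllib.error.URLError as e:
    # Lower-level failure, e.g. the host name could not be resolved
    print('URLError:', e.reason)
```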
Cookie login
cookie: carry a logged-in cookie in the request headers to skip the login step
Referer: checked by sites for hotlink protection
handler: customize more advanced requests. As the business logic grows more complex, customizing the Request object alone is no longer enough (dynamic cookies and proxies cannot be handled that way); see the sketch after this list.
Proxy servers: get around restrictions on your own IP and visit otherwise unreachable (e.g. foreign) sites, access internal resources of certain organizations, improve access speed, and hide your real IP (proxy pools, 快代理)
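A minimal sketch of the handler → opener → open pattern, here with a cookie-aware handler (the URL and User-Agent are just placeholders); the block below swaps in ProxyHandler for the proxy case:

```python
import urllib.request
import http.cookiejar

url = 'http://www.baidu.com'  # placeholder URL
headers = {'User-Agent': 'Mozilla/5.0'}
request = urllib.request.Request(url=url, headers=headers)

# Build an opener from a handler; HTTPCookieProcessor keeps cookies between requests
cookie_jar = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie_jar)
opener = urllib.request.build_opener(handler)

response = opener.open(request)
print(response.read().decode('utf-8')[:200])
```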
```python
import urllib.request

url = 'http://www.baidu.com/s?wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)

# Route the request through a proxy: handler -> opener -> open
proxies = {'http': '117.141.155.244:53281'}
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)

content = response.read().decode('utf-8')
with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
```
requests
```python
proxy = {'http': '219.149.59.250:9797'}
r = requests.get(url=url, params=data, headers=headers, proxies=proxy)
```
```python
import requests
import os

# 1. The generic page-fetching framework
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()              # raise an exception for non-200 status codes
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "产生异常"

if __name__ == "__main__":
    url = "http://www.baidu.com"
    print(getHTMLText(url))

# 2. Amazon product page: some sites reject the default requests User-Agent
url = "https://www.amazon.cn/gp/product/B01M8L5Z3Y"
try:
    kv = {'user-agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=kv)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[1000:2000])
except:
    print("爬取失败")

# 3. Baidu search: pass the keyword through the params argument
keyword = "Python"
url = 'http://www.baidu.com/s'
try:
    kv = {'wd': keyword}
    r = requests.get(url, params=kv)
    print(r.status_code)
    print(r.request.url)
    r.raise_for_status()
    print(len(r.text))
except:
    print("爬取失败")

# 4. Download and save an image
url = "http://xwzx.cumt.edu.cn/_upload/article/images/2f/99/d44299934d00afe8f03684d5c59b/f682d3bc-972c-4d5a-9542-c52e0b72032f.jpg"
root = "D://pics//"
path = root + url.split('/')[-1]
try:
    if not os.path.exists(root):
        os.mkdir(root)
    if not os.path.exists(path):
        r = requests.get(url)
        with open(path, 'wb') as f:
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已存在")
except:
    print("爬取失败")

# 5. IP location lookup
url = "https://ipchaxun.com/"
try:
    r = requests.get(url + '202.204.80.112')
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[-500:])
except:
    print("爬取失败")
```
Parsing: XPath
Basic XPath syntax
Path queries
//: selects all descendant nodes, regardless of nesting level
/: selects direct children only
Predicate queries
//div[@id]
//div[@id="maincontent"]
Attribute queries
//@class
Fuzzy matching
//div[contains(@id, "he")]
//div[starts-with(@id, "he")]
Text content
//div/h1/text()
Logical operators
//div[@id="head" and @class="s_down"]
//title | //price (the | union operator; the result is effectively a list of both node sets)
```python
from lxml import etree

# Parse a local HTML file
html_tree = etree.parse('XX.html')

# Or parse the HTML text of a response
html_tree = etree.HTML(response.read().decode('utf-8'))

# Evaluate an XPath expression; the result is a list
html_tree.xpath('<xpath expression>')
```
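A small self-contained example applying a few of the expressions above to an inline snippet (the HTML itself is made up for illustration):

```python
from lxml import etree

# Made-up HTML fragment for illustration
html = '''
<div id="head">
  <ul class="s_down">
    <li class="item">first</li>
    <li class="item">second</li>
  </ul>
</div>
'''

tree = etree.HTML(html)
print(tree.xpath('//li/text()'))                                    # ['first', 'second']
print(tree.xpath('//div[@id="head"]//li/@class'))                   # ['item', 'item']
print(tree.xpath('//ul[starts-with(@class, "s_")]/li[1]/text()'))   # ['first']
```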
JSONPath tutorial link: http://blog.csdn.net/luxideyao/article/details/77802389
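A minimal sketch of the jsonpath package the linked tutorial covers (the data is made up; as I understand it, jsonpath.jsonpath() returns a list of matches, or False when nothing matches):

```python
import jsonpath  # pip install jsonpath

# Made-up data for illustration
obj = {
    'store': {
        'book': [
            {'title': 'Book A', 'price': 10},
            {'title': 'Book B', 'price': 20},
        ]
    }
}

print(jsonpath.jsonpath(obj, '$.store.book[*].title'))  # ['Book A', 'Book B']
print(jsonpath.jsonpath(obj, '$..price'))               # [10, 20]
```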
Beautiful Soup
```python
import requests
from bs4 import BeautifulSoup
import bs4

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""

def fillUnivList(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    for tr in soup.find('tbody').children:
        if isinstance(tr, bs4.element.Tag):   # skip NavigableString children such as '\n'
            tds = tr('td')
            ulist.append([tds[0].string, tds[1].string, tds[3].string])

def printUnivList(ulist, num):
    # chr(12288) is the full-width space, so Chinese columns line up
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    print(tplt.format("排名", "学校名称", "总分", chr(12288)))
    for i in range(num):
        u = ulist[i]
        print(tplt.format(u[0], u[1], u[2], chr(12288)))

def main():
    uinfo = []
    url = 'https://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 20)

main()
```
Re
```python
import requests
import re

def getHTMLText(url):
    try:
        # Taobao search requires a logged-in Cookie, copied from the browser's dev tools
        kv = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.76',
            'Cookie': '【去审查元素自己看看】'
        }
        r = requests.get(url, timeout=30, headers=kv)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        print("getHTMLText")
        return ""

def parsePage(ilt, html):
    try:
        plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
        tlt = re.findall(r'\"raw_title\"\:\".*?\"', html)
        for i in range(len(plt)):
            price = eval(plt[i].split(':')[1])
            title = eval(tlt[i].split(':')[1])
            ilt.append([price, title])
    except:
        print("parsePage")

def printGoodsList(ilt):
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称"))
    count = 0
    for g in ilt:
        count = count + 1
        print(tplt.format(count, g[0], g[1]))

goods = '书包'
depth = 3
start_url = 'https://s.taobao.com/search?q=' + goods
infoList = []
for i in range(depth):
    try:
        url = start_url + '&s=' + str(44 * i)
        html = getHTMLText(url)
        parsePage(infoList, html)
    except:
        continue
printGoodsList(infoList)
```
```python
import requests
from bs4 import BeautifulSoup
import traceback
import re

def getHTMLText(url, code="utf-8"):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except:
        return ""

def getStockList(lst, stockURL):
    # The stock list page is GB2312-encoded; collect codes like sh600000 / sz000001
    html = getHTMLText(stockURL, "GB2312")
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except:
            continue

def getStockInfo(lst, stockURL, fpath):
    count = 0
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict.update({'股票名称': name.text.split()[0]})
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
                count = count + 1
                print("\r当前进度: {:.2f}%".format(count * 100 / len(lst)), end="")
        except:
            count = count + 1
            print("\r当前进度: {:.2f}%".format(count * 100 / len(lst)), end="")
            continue

def main():
    stock_list_url = 'https://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'D:/BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)

main()
```
Related reading: python爬取淘宝商品信息&requests.get()和网页源代码不一致 (jzj_c_love的博客, CSDN)
Selenium
Steps for using Selenium (a combined sketch follows this list):
Import it: from selenium import webdriver
Create a Chrome browser object: path = path to the chromedriver executable; browser = webdriver.Chrome(path)
Visit a URL: url = the address to visit; browser.get(url)
Element location in Selenium: find_element
Accessing element information
Get an element attribute: .get_attribute('class')
Get the element text: .text
Get the tag name: .tag_name
Interaction
Click: click()
Type text: send_keys()
Go back: browser.back()
Go forward: browser.forward()
Scroll with JS: js = 'document.documentElement.scrollTop=100000'; browser.execute_script(js)
Execute JS code: execute_script(); get the page source: page_source
Quit: browser.quit()
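A minimal sketch stringing these steps together against Baidu's homepage. The driver path and the element ids kw/su are assumptions about your setup and the page; newer Selenium versions locate the driver via a Service object rather than a plain path:

```python
from selenium import webdriver
from selenium.webdriver.common.by import By

path = 'chromedriver.exe'          # assumed location of the chromedriver executable
browser = webdriver.Chrome(path)   # older Selenium accepts the driver path directly

browser.get('https://www.baidu.com')

# Locate the search box, inspect it, type a keyword, and click the search button
box = browser.find_element(By.ID, 'kw')      # 'kw' assumed to be the search box id
print(box.tag_name)
print(box.get_attribute('class'))
box.send_keys('selenium')
browser.find_element(By.ID, 'su').click()    # 'su' assumed to be the search button id

# Scroll to the bottom of the page with a JS snippet
browser.execute_script('document.documentElement.scrollTop=100000')

print(browser.page_source[:200])
browser.back()
browser.quit()
```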
Chrome headless
```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def share_browser():
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # Path to the Chrome browser binary
    path = r'[---]'
    chrome_options.binary_location = path
    browser = webdriver.Chrome(chrome_options=chrome_options)
    return browser

# In another script, reuse the helper (saved here as handless.py)
from handless import share_browser

browser = share_browser()
browser.get('http://www.baidu.com/')
browser.save_screenshot('handless1.png')
```
Scrapy
```python
import scrapy
from scrapy_dangdang_095.items import ScrapyDangdang095Item

class DangSpider(scrapy.Spider):
    name = 'dang'
    allowed_domains = ['category.dangdang.com']
    start_urls = ['http://category.dangdang.com/cp01.01.02.00.00.00.html']

    base_url = 'http://category.dangdang.com/pg'
    page = 1

    def parse(self, response):
        li_list = response.xpath('//ul[@id="component_59"]/li')
        for li in li_list:
            # Lazy-loaded images keep the real URL in data-original; the first image only has src
            src = li.xpath('.//img/@data-original').extract_first()
            if src:
                src = src
            else:
                src = li.xpath('.//img/@src').extract_first()

            name = li.xpath('.//img/@alt').extract_first()
            price = li.xpath('.//p[@class="price"]/span[1]/text()').extract_first()

            book = ScrapyDangdang095Item(src=src, name=name, price=price)
            yield book

        # Crawl the next page by re-entering parse()
        if self.page < 100:
            self.page = self.page + 1
            url = self.base_url + str(self.page) + '-cp01.01.02.00.00.00.html'
            yield scrapy.Request(url=url, callback=self.parse)
```
```python
class ScrapyDangdang095Item(scrapy.Item):
    src = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()
```
```python
from itemadapter import ItemAdapter
import urllib.request

class ScrapyDangdang095Pipeline:
    def open_spider(self, spider):
        self.fp = open('book.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.fp.write(str(item))
        return item

    def close_spider(self, spider):
        self.fp.close()

class DangDangDownloadPipeline:
    # A second pipeline that downloads each book cover image
    def process_item(self, item, spider):
        url = 'http:' + item.get('src')
        filename = './books/' + item.get('name') + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
        return item
```
```python
ITEM_PIPELINES = {
    'scrapy_dangdang_095.pipelines.ScrapyDangdang095Pipeline': 300,
    'scrapy_dangdang_095.pipelines.DangDangDownloadPipeline': 301,
}
```
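With both pipelines registered in settings.py as above (the lower number runs first), the spider is started from the project directory with `scrapy crawl dang`. Note that DangDangDownloadPipeline writes into ./books/, so that folder has to exist before the crawl, otherwise urlretrieve will fail.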