几个简陋的爬虫程序
2017.07.18
stylem
 热度
℃
利用审查元素得到的一个具有翻译功能的爬虫程序
import urllib.request
import urllib.parse
import json
import time
# Interactive translation client: repeatedly prompt for text, submit it to the
# Youdao web translation endpoint as a form, and print the translated result.
# The loop has no exit condition; stop it with Ctrl+C.
while True:
    content = input("请输入要翻译的内容:")
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc'
    # Form fields sent with the request; 'i' carries the text typed by the user.
    form = {
        'type': 'AUTO',
        'i': content,
        'doctype': 'json',
        'xmlversion': '2.1',
        'keyfrom': 'fanyi.web',
        'ue': 'UTF-8',
        'action': 'FY_BY_CL1CKBUTTON',
        'typoResult': 'true',
    }
    data = urllib.parse.urlencode(form).encode('utf-8')
    req = urllib.request.Request(url, data)
    # Send a browser-like User-Agent header along with the request.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.108 Safari/537.36 2345Explorer/8.6.2.15784')
    # Close the HTTP response on every iteration; without the context manager
    # each pass through the loop would leave one connection open.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode('utf-8')
    target = json.loads(html)
    # The translated text is the 'tgt' field of the first nested entry.
    target = target['translateResult'][0][0]['tgt']
    print(target)
    # Pause five seconds before the next prompt
    # (presumably to throttle requests -- confirm).
    time.sleep(5)
一个爬取百度贴吧图片的程序
#coding=utf-8
import urllib.request
import re
def getHtml(url: str) -> str:
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address passed to ``urllib.request.urlopen``.

    Returns:
        The complete response body as text.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even if reading or decoding
    # fails, so no connection is left open.
    with urllib.request.urlopen(url) as page:
        return page.read().decode('utf-8')
def getImg(html: str, save_dir: str = 'D:\\test') -> None:
    """Download every ``.jpg`` image referenced in *html* into *save_dir*.

    An image URL is whatever the pattern ``src="(.+?\\.jpg)" style=`` captures.
    Files are saved as ``0.jpg``, ``1.jpg``, ... in the order the URLs appear.

    Args:
        html: Page markup to scan for image URLs.
        save_dir: Directory that receives the downloaded files. It is created
            if it does not exist. Defaults to ``D:\\test``.

    Returns:
        None. Nothing is created or downloaded when no URL matches.

    Raises:
        urllib.error.URLError: If an image URL cannot be opened.
        OSError: If the directory or a target file cannot be created.
    """
    # Function-scope import keeps this block self-contained.
    import os

    imglist = re.findall(r'src="(.+?\.jpg)" style=', html)
    if not imglist:
        return

    # Create the target directory up front so the first download does not
    # fail just because the folder is missing.
    os.makedirs(save_dir, exist_ok=True)
    for index, imgurl in enumerate(imglist):
        urllib.request.urlretrieve(imgurl, os.path.join(save_dir, '%s.jpg' % index))
# Fetch one Tieba thread page, then download the images found in its markup.
thread_url = "https://tieba.baidu.com/p/2855718935#!/l/p1"
html = getHtml(thread_url)
getImg(html)
一个爬取过滤掉图片的段子的程序
import urllib
import urllib.request
import re
# Fetch the qiushibaike "history" page and print the entries whose captured
# image source does not contain the substring "img".
url = 'https://www.qiushibaike.com/history/'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.108 Safari/537.36 2345Explorer/8.6.2.15784'}
request = urllib.request.Request(url=url, headers=headers)
# Close the HTTP response as soon as the body has been read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')
# Three capture groups per entry:
#   0 - text of the <h2> inside the "author clearfix" div
#       (presumably the author name -- confirm against the page markup)
#   1 - text preceding </span> after the 'content">' marker
#   2 - value of the following <img> tag's src attribute
# re.S lets "." match newlines so one entry can span several lines.
pattern = re.compile('<div.*?author clearfix">.*?<h2>(.*?)</h2>.*?<div.*?<span>.*?content">.*?(.*?)</span>.*?</div>.*?<img src="(.*?)" alt=.*?>.*?</i>',re.S)
items = pattern.findall(content)
for item in items:
    # A plain substring test is equivalent to searching for the literal "img".
    haveImg = "img" in item[2]
    if not haveImg:
        print(item[0], item[1])