盒子
盒子
Posts List
  1. 利用审查元素得到的一个具有翻译功能的爬虫程序
  2. 一个爬取百度贴吧图片的程序
  3. 一个爬取过滤掉图片的段子的程序

几个简陋的爬虫程序

利用审查元素得到的一个具有翻译功能的爬虫程序

import urllib.request
import urllib.parse
import json
import time

# Interactive translator: repeatedly prompt for text, POST it to the Youdao
# web translation endpoint, and print the translated text.
#
# NOTE: every step after the prompt must live inside the loop body; otherwise
# the loop only re-prompts forever and no request is ever sent.

# Loop-invariant request settings.
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.108 Safari/537.36 2345Explorer/8.6.2.15784'

while True:
    content = input("请输入要翻译的内容:")

    # Form fields sent with the POST; 'i' carries the text to translate.
    data = {
        'type': 'AUTO',
        'i': content,
        'doctype': 'json',
        'xmlversion': '2.1',
        'keyfrom': 'fanyi.web',
        'ue': 'UTF-8',
        'action': 'FY_BY_CL1CKBUTTON',
        'typoResult': 'true',
    }
    data = urllib.parse.urlencode(data).encode('utf-8')

    # Supplying a body makes this a POST request; a browser-like User-Agent
    # header is attached to it.
    req = urllib.request.Request(url, data)
    req.add_header('User-Agent', user_agent)

    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode('utf-8')

    # The translated text is read from translateResult[0][0]['tgt'] of the
    # JSON reply.
    target = json.loads(html)
    target = target['translateResult'][0][0]['tgt']

    print(target)
    # Wait 5 seconds before prompting again.
    time.sleep(5)

一个爬取百度贴吧图片的程序

#coding=utf-8
import urllib.request
import re

def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The response is closed as soon as the body has been read, so the
    underlying connection or file handle is not leaked.

    Raises whatever ``urllib.request.urlopen`` raises for an unreachable
    URL, and ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    with urllib.request.urlopen(url) as page:
        html = page.read()
    return html.decode('utf-8')

def getImg(html):
    r"""Download every ``.jpg`` image referenced in *html*.

    An image URL is the shortest ``src="..."`` value ending in ``.jpg``
    that is directly followed by `` style=``. Matches are saved in order
    of appearance as ``D:\test\<index>.jpg``, with the index starting at 0.
    Nothing is downloaded when *html* contains no match.
    """
    jpg_src = re.compile(r'src="(.+?\.jpg)" style=')
    for index, image_url in enumerate(jpg_src.findall(html)):
        urllib.request.urlretrieve(image_url, 'D:\\test\\%s.jpg' % index)


# Fetch the Tieba thread page, then download every matching image from it.
thread_url = "https://tieba.baidu.com/p/2855718935#!/l/p1"
html = getHtml(thread_url)
getImg(html)

一个爬取过滤掉图片的段子的程序

import urllib
import urllib.request
import re


# Scrape the qiushibaike "history" page and print the posts whose captured
# image source does not contain the substring "img".
url = 'https://www.qiushibaike.com/history/'

# Browser-like User-Agent sent with the request.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.108 Safari/537.36 2345Explorer/8.6.2.15784'}

request = urllib.request.Request(url=url, headers=headers)

# The context manager closes the connection once the body has been read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')

# Three capture groups per post: the text inside <h2> (presumably the author
# name -- verify against the live markup), the text leading up to </span>
# after the `content">` marker, and the src attribute of the following <img>.
# re.S lets `.` span newlines so one match can cover a multi-line post.
pattern = re.compile('<div.*?author clearfix">.*?<h2>(.*?)</h2>.*?<div.*?<span>.*?content">.*?(.*?)</span>.*?</div>.*?<img src="(.*?)" alt=.*?>.*?</i>', re.S)
items = re.findall(pattern, content)
for author, text, img_src in items:
    # Skip entries whose image source mentions "img"; print the rest.
    if not re.search("img", img_src):
        print(author, text)
支持一下
扫一扫,支持forsigner