Python 爬真实自拍

前几天有位老兄发了个真实自拍.com,发现站上面的图都挺不错的,但是看起来又不够爽,就改了个爬图脚本,爬了14G多的图片。
有需要的福娃可以自己爬一下。在D盘建个1024文件夹,跑起来就可以了,26楼大哥的代码大家也可以试试。页数越靠后,图床失效的几率越大。
不说了,我去买纸巾了。

import urllib.request,socket,re,sys,os,pathlib,time,random

# Root of the site being crawled; relative page names are appended to it,
# so it must keep its trailing slash.
baseUrl = 'https://xn--qbt00o3ns2fk.xyz/'

# Folder that receives one sub-folder per gallery. The trailing separator is
# required because callers build paths by appending the gallery title.
# NOTE(review): the pasted source read "D:1024" (backslashes lost in
# extraction); restored to the D:\1024 folder described in the post.
targetPath = "D:\\1024\\"

def getContant(Weburl):
    """Download a page and return its body as text.

    Args:
        Weburl: Absolute URL of the page to fetch.

    Returns:
        The response body decoded as UTF-8; undecodable bytes are dropped.

    Raises:
        Whatever ``urllib.request.urlopen`` raises when the request fails
        (for example ``urllib.error.URLError`` or ``socket.timeout``).
    """
    Webheader = {
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.103 Safari/537.36',
    }
    req = urllib.request.Request(url=Weburl, headers=Webheader)
    # The context manager closes the response even when read() raises.
    with urllib.request.urlopen(req) as response:
        body = response.read()
    # Decode rather than str(body): str() on bytes produces the "b'...'"
    # repr with escaped bytes instead of the actual page text.
    return body.decode('utf-8', 'ignore')

def gettitle(url):
    """Fetch a page and return its heading matches as the repr of a list.

    Args:
        url: Absolute URL of the page to inspect.

    Returns:
        ``str()`` of the list of regex matches, e.g. ``"['Some title']"``,
        or ``"[]"`` when nothing matches. The list repr is kept on purpose:
        the caller in this file strips the wrapper by slicing ``[2:-2]``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises when the request fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/51.0.2704.63 Safari/537.36'
    }
    req = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the response even when read() raises.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8", 'ignore')
    # NOTE(review): the HTML tag inside this pattern was stripped when the
    # script was pasted into a web page; only "(.+)" survived. It is
    # reconstructed here as <h1> -- confirm against the site's markup.
    title = re.findall('<h1>(.+)</h1>', html)
    return str(title)

def getUrl(URL):
    """Collect the content-page URLs linked from one index page.

    Args:
        URL: Absolute URL of an index page.

    Returns:
        A list of absolute URLs (``baseUrl`` + ``content_<digits>.html``),
        in the order they first appear on the page, without duplicates.
    """
    # The page is fetched once; the original wrapped this in a
    # ``for i in range(1, 256)`` loop whose index was never used.
    contant = getContant(URL)
    # ``\d`` and the literal dot are restored: the pasted pattern had lost
    # its backslash ("d*") and left the dot unescaped.
    page_names = re.findall(r'content_\d+\.html', contant)
    # dict.fromkeys drops repeated links while keeping page order, so the
    # same gallery is not processed several times.
    return [baseUrl + name for name in dict.fromkeys(page_names)]

def openUrl(url):
    """Download every image of one content page into its own folder.

    The folder is named after the page title and created under
    ``targetPath`` when it does not exist yet.

    Args:
        url: Absolute URL of the content page.

    Raises:
        Whatever ``urllib.request.urlopen`` or ``os.makedirs`` raise; the
        caller decides how to report or retry.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/51.0.2704.63 Safari/537.36'
    }
    # gettitle() returns the repr of a list such as "['Some title']";
    # slicing off "['" and "']" leaves the bare title ("" when no match).
    title = gettitle(url)[2:-2]
    # Replace characters that Windows forbids in file names and drop the
    # trailing spaces/dots it cannot store, so folder creation cannot fail
    # on an unlucky title.
    title = re.sub(r'[\\/:*?"<>|]', '_', title).strip(' .')
    # os.path.join works whether or not targetPath ends with a separator.
    filePath = os.path.join(targetPath, title)
    # exist_ok replaces the duplicated isdir()/mkdir() branches and also
    # creates targetPath itself when it is missing.
    os.makedirs(filePath, exist_ok=True)

    req = urllib.request.Request(url=url, headers=headers)
    with urllib.request.urlopen(req) as res:
        data = res.read()
    downImg(data, filePath)

def _download_image(link, dest, retries=5):
    """Save ``link`` to ``dest``, retrying only when the request times out.

    Prints "faild!" for a non-timeout error and "faild" once the initial
    attempt plus ``retries`` retries have all timed out. A partially
    written file is removed after every failed attempt.

    Returns:
        True when the file was downloaded, False otherwise.
    """
    from urllib.error import URLError

    for _attempt in range(1 + retries):
        try:
            urllib.request.urlretrieve(link, dest)
            return True
        except socket.timeout:
            timed_out = True
        except URLError as err:
            # Connect-phase timeouts arrive wrapped in URLError.reason.
            timed_out = isinstance(err.reason, socket.timeout)
        except Exception:
            timed_out = False
        # A half-written file would be reported as "exist" on the next run
        # and never be repaired, so remove it.
        if os.path.exists(dest):
            os.remove(dest)
        if not timed_out:
            print("faild!")
            return False
    print("faild")
    return False


def downImg(data, filePath):
    """Download every .jpg/.png linked in a page into ``filePath``.

    Args:
        data: Page content, either raw bytes or text.
        filePath: Existing directory that receives the images.

    Files that already exist are skipped (and reported as "exist");
    download errors are printed and never raised, so one broken image host
    does not abort the rest of the page.
    """
    if isinstance(data, (bytes, bytearray)):
        # Decode instead of str(bytes): the bytes repr escapes whitespace
        # and quotes, which confuses URL matching.
        text = bytes(data).decode('utf-8', 'ignore')
    else:
        text = str(data)

    # Configure the timeout and the User-Agent once instead of per image.
    socket.setdefaulttimeout(30)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
    urllib.request.install_opener(opener)

    # The pasted pattern used "[http|https]", a character class matching a
    # single letter, and then re-attached the missing prefix by hand.
    # Matching the scheme properly makes that workaround unnecessary.
    found = re.findall(r'https?://[^\s"\'<>]+?\.(?:jpg|png)', text)
    # dict.fromkeys removes duplicates while keeping page order.
    for link in dict.fromkeys(found):
        print(link)
        strpicpath = seFile(link, filePath)
        if pathlib.Path(strpicpath).exists():
            print("exist")
            continue
        if _download_image(link, strpicpath):
            # Small random pause between downloads, as in the original.
            time.sleep(random.uniform(0, 0.5))

def seFile(path, filePath):
    """Return the local file path for a remote resource.

    Args:
        path: URL (or slash-separated path) of the remote file.
        filePath: Directory the file will be stored in.

    Returns:
        ``filePath`` joined with the part of ``path`` after its last "/".
        When ``path`` contains no slash the whole string is used as the
        file name (the original raised ValueError from ``rindex`` here).
    """
    # rpartition always returns a 3-tuple; the last item is the text after
    # the final "/", or the whole string when there is none.
    file_name = path.rpartition('/')[2]
    return os.path.join(filePath, file_name)

def openPage(UrlList):
    """Process every content page in ``UrlList``, retrying on timeouts.

    Each page gets one attempt plus up to five retries when the request
    times out. Any other error, or running out of retries, prints
    "<url> faild" and moves on to the next page.

    Args:
        UrlList: Iterable of absolute content-page URLs.
    """
    from urllib.error import URLError

    max_attempts = 6  # first try + 5 retries, as the original intended
    socket.setdefaulttimeout(30)

    for pageUlr in UrlList:
        print(pageUlr)
        succeeded = False
        for _attempt in range(max_attempts):
            try:
                openUrl(pageUlr)
            except socket.timeout:
                # Read-phase timeout: try the page again.
                continue
            except URLError as err:
                # Connect-phase timeouts arrive wrapped in URLError.reason.
                if isinstance(err.reason, socket.timeout):
                    continue
                break
            except Exception:
                # Exception (not a bare except) keeps Ctrl-C working.
                break
            else:
                succeeded = True
                break
        if not succeeded:
            print(pageUlr + ' faild')

# Common prefix of every index page: <baseUrl>index_<n>.html
URL = baseUrl + 'index_'

# Guarded so that importing this file does not start a crawl.
if __name__ == "__main__":
    # Walk index pages 1..254; a failure on one page must not stop the run.
    for num in range(1, 255):
        try:
            # Clear the console between pages ('cls' only exists on Windows).
            os.system('cls' if os.name == 'nt' else 'clear')
            print("#######################################")
            print("##########download#############################")
            print(URL + str(num) + '.html')
            print("#######################################")
            print("#######################################")
            UrlList = getUrl(URL + str(num) + '.html')
            openPage(UrlList)
        except Exception:
            # Exception instead of a bare except so Ctrl-C still aborts.
            print('faild')

发表回复

您的电子邮箱地址不会被公开。 必填项已用 * 标注