import os
import re
import threading
import time
from multiprocessing.dummy import Pool
from queue import Empty
from queue import Queue

import requests
class DownLoad_Imgs():
    """Download the images of a pic.netbian.com gallery with a thread pool.

    Listing-page URLs are put on a queue. Each worker task takes one listing
    page, follows every image detail link found on it and writes the images
    into ``path``.

    Attributes:
        basic_url: Site root, prefixed to the relative links found in pages.
        html_url: First listing page of the gallery to download.
        header: HTTP headers sent with every request.
        path: Directory the image files are written to.
        queue: Listing-page URLs waiting to be processed.
        pool: Thread pool that runs the worker tasks.
        is_running: While true, a finished worker may schedule a follow-up.
        request_num: Number of listing pages queued.
        response_num: Number of listing pages finished, whether or not they
            succeeded; ``run`` returns once it reaches ``request_num``.
    """

    # Seconds to wait on the server before a request is abandoned.
    REQUEST_TIMEOUT = 10

    # Compiled once instead of on every call.
    _DETAIL_LINK_RE = re.compile('<a href="(/tupian.+?)" target="_blank">')
    _IMAGE_RE = re.compile('<img src="(.+?)" data-pic=".+?title="(.+?)"')
    _UNSAFE_FILENAME_CHARS_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

    def __init__(self, path):
        """Set up URLs, request headers, the work queue and the thread pool.

        :param path: directory the downloaded images are saved into
        """
        # Site root URL
        self.basic_url = 'http://pic.netbian.com'
        # Gallery listing page to download from
        self.html_url = "http://pic.netbian.com/4kmeinv/"
        # Request headers
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
        }
        # Where images are saved
        self.path = path
        self.queue = Queue()  # listing-page URLs still to be processed
        self.pool = Pool()  # thread pool running the workers
        self.is_running = True  # workers keep rescheduling while this is set
        self.request_num = 0  # listing pages queued
        self.response_num = 0  # listing pages finished
        # ``response_num`` is incremented from several worker threads.
        self._lock = threading.Lock()

    def get_url_list(self, page_count=20):
        """Queue the listing-page URLs of the first ``page_count`` pages.

        :param page_count: number of listing pages to queue (default 20)
        :return: None
        """
        for page in range(1, page_count + 1):
            # The first page has no index suffix.
            if page == 1:
                url = self.html_url
            else:
                url = self.html_url + "index_" + str(page) + ".html"
            self.queue.put(url)
            self.request_num += 1

    def get_images_info(self, url):
        """Return the image detail-page URLs linked from one listing page.

        :param url: URL of the listing page
        :return: list of absolute detail-page URLs
        :raises requests.RequestException: on network failure, timeout or
            an HTTP error status
        """
        resp = requests.get(url, headers=self.header,
                            timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        return [self.basic_url + link
                for link in self._DETAIL_LINK_RE.findall(resp.text)]

    def get_images(self, view_url):
        """Return the image URL and title found on an image detail page.

        :param view_url: URL of the image detail page
        :return: ``(pic_url, pic_title)`` taken from the first matching
            ``<img>`` tag
        :raises IndexError: if the page contains no matching ``<img>`` tag
        :raises requests.RequestException: on network failure, timeout or
            an HTTP error status
        """
        # ``headers`` must be passed by keyword: the second positional
        # argument of ``requests.get`` is ``params`` (the query string).
        resp = requests.get(view_url, headers=self.header,
                            timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        # The page is decoded as GBK; undecodable bytes are replaced so a
        # single bad byte does not abort the download.
        content = resp.content.decode("gbk", errors="replace")
        match = self._IMAGE_RE.search(content)
        if match is None:
            raise IndexError("no image found on " + view_url)
        return match.group(1), match.group(2)

    @classmethod
    def _safe_filename(cls, title):
        """Turn a scraped title into a name that is safe to use as a file name."""
        cleaned = cls._UNSAFE_FILENAME_CHARS_RE.sub("_", str(title)).strip(" .")
        return cleaned or "untitled"

    def download(self, pic_url_title):
        """Download one image and save it as ``<title>.jpg`` under ``path``.

        :param pic_url_title: ``(pic_url, pic_title)`` as returned by
            ``get_images``; ``pic_url`` is relative to ``basic_url``
        :return: None
        :raises requests.RequestException: on network failure, timeout or
            an HTTP error status
        :raises OSError: if the file cannot be written
        """
        pic_url, pic_title = pic_url_title
        pic_resp = requests.get(self.basic_url + pic_url, headers=self.header,
                                timeout=self.REQUEST_TIMEOUT)
        # Do not save an error page as if it were an image.
        pic_resp.raise_for_status()
        # The title comes from the remote page, so strip path separators and
        # other characters that are not valid in file names.
        image_path = os.path.join(
            self.path, self._safe_filename(pic_title) + '.jpg')
        print(pic_url, pic_title)
        with open(image_path, 'wb') as f:
            f.write(pic_resp.content)

    def _run(self):
        """Process one queued listing page: resolve its images and save them.

        Returns immediately when the queue is already empty. A failure never
        propagates out of this method, and the page is always counted in
        ``response_num``, so ``run`` cannot wait forever on a failed page.
        """
        try:
            # Never block: a worker scheduled after the queue was drained
            # would otherwise sit in ``get()`` forever.
            url = self.queue.get_nowait()
        except Empty:
            return
        try:
            for view_url in self.get_images_info(url):
                try:
                    self.download(self.get_images(view_url))
                except (requests.RequestException, OSError,
                        LookupError, ValueError) as exc:
                    # One bad image must not abort the rest of the page.
                    print("failed to download", view_url, repr(exc))
        except Exception as exc:
            # Worker boundary: the pool would silently drop the exception
            # and skip the callback, so report it here instead.
            print("failed to process", url, repr(exc))
        finally:
            with self._lock:
                self.response_num += 1

    def _callback(self, temp):
        """Schedule the next worker once one has finished.

        :param temp: return value of the finished worker; unused, but the
            pool always passes it to the callback
        """
        if self.is_running and not self.queue.empty():
            self.pool.apply_async(self._run, callback=self._callback)

    def run(self, page_count=20, concurrency=3):
        """Download every image of the first ``page_count`` listing pages.

        :param page_count: number of listing pages to process (default 20)
        :param concurrency: number of listing pages processed at the same
            time (default 3)
        :return: None
        """
        # 1. Queue the listing-page URLs.
        self.get_url_list(page_count)
        # 2. Start the worker chains; each one reschedules itself through
        #    ``_callback`` while pages remain.
        for _ in range(concurrency):
            self.pool.apply_async(self._run, callback=self._callback)
        # 3. Wait until every queued page has been handled. Sleeping 50 ms
        #    between checks keeps the wait cheap.
        while self.response_num < self.request_num:
            time.sleep(0.05)
        self.is_running = False
        # All pages are done, so only no-op tasks can remain; shut down the
        # pool instead of leaving its threads behind.
        self.pool.close()
        self.pool.join()
if __name__ == '__main__':
    # Script entry point: download the gallery into ./美女 and report how
    # long the whole run took.
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    started = time.perf_counter()
    images_path = "./美女"
    # exist_ok avoids the race between checking for the directory and
    # creating it.
    os.makedirs(images_path, exist_ok=True)
    dl = DownLoad_Imgs(images_path)
    dl.run()
    print("total cost:", time.perf_counter() - started)
啊,这是我的知识盲区了。。