import requests
import os
import re
import time
from multiprocessing import JoinableQueue as Queue # 与线程不一样
from multiprocessing import Process
class DownLoad_Imgs():
    """Download the images of the pic.netbian.com "4kmeinv" gallery.

    The work is organised as a pipeline of worker processes connected by
    joinable queues:

    1. ``get_url_list`` puts the gallery list-page URLs on ``url_queue``.
    2. ``get_images_info`` reads list pages and puts the detail-page URLs it
       finds on ``info_url_queue``.
    3. ``get_images`` reads detail pages and puts ``(image path, title)``
       tuples on ``image_url_queue``.
    4. ``download`` fetches each image and writes it below ``path``.

    Every worker marks an item as done only after it has queued the follow-up
    work, so joining the queues in pipeline order waits for the whole crawl.
    """

    # Characters that cannot appear in a file name on common file systems.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

    def __init__(self, path, page_count=20, max_retries=3, timeout=10):
        """Set up URLs, request headers and the inter-process queues.

        :param path: directory the downloaded images are written to
        :param page_count: number of gallery list pages to crawl
        :param max_retries: attempts per URL before the URL is given up
        :param timeout: per-request timeout in seconds
        """
        # Site root; scraped hrefs are relative to it.
        self.basic_url = 'http://pic.netbian.com'
        # First list page of the gallery to download.
        self.html_url = "http://pic.netbian.com/4kmeinv/"
        # Request headers.
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
        }
        # Where images are saved.
        self.path = path
        self.page_count = page_count
        self.max_retries = max_retries
        self.timeout = timeout
        self.url_queue = Queue()  # list-page URLs
        self.info_url_queue = Queue()  # image detail-page URLs
        self.image_url_queue = Queue()  # (image path, title) tuples
        self.save_queue = Queue()  # reserved for content waiting to be saved (currently unused)

    @staticmethod
    def _safe_filename(title):
        """Return ``title`` with characters that are illegal in file names replaced."""
        cleaned = DownLoad_Imgs._UNSAFE_FILENAME_CHARS.sub('_', str(title)).strip()
        return cleaned or 'untitled'

    def _fetch(self, url):
        """GET ``url`` with the configured headers, retrying on failure.

        :return: the response with status 200, or ``None`` once
                 ``max_retries`` attempts have failed
        """
        for _ in range(self.max_retries):
            try:
                # headers must be passed by keyword: the second positional
                # argument of requests.get is the query-string ``params``.
                resp = requests.get(url, headers=self.header, timeout=self.timeout)
            except requests.RequestException as exc:
                print("request failed:", url, exc)
                continue
            if resp.status_code == 200:
                return resp
        print("giving up on:", url)
        return None

    def get_url_list(self):
        """Put the URL of every gallery list page on ``url_queue``.

        Page 1 is ``html_url`` itself; page ``i`` (i >= 2) is
        ``html_url + "index_<i>.html"``.

        :return: None
        """
        for i in range(1, self.page_count + 1):
            if i == 1:
                url = self.html_url
            else:
                url = self.html_url + "index_" + str(i) + ".html"
            self.url_queue.put(url)

    def get_images_info(self):
        """Worker loop: turn list pages into detail-page URLs.

        Takes list-page URLs from ``url_queue`` forever, extracts every
        ``/tupian...`` link and puts its absolute URL on ``info_url_queue``.

        :return: never returns
        """
        link_pattern = re.compile('<a href="(/tupian.+?)" target="_blank">', re.M)
        while True:
            url = self.url_queue.get()
            try:
                resp = self._fetch(url)
                if resp is None:
                    continue
                for detail_path in link_pattern.findall(resp.text):
                    self.info_url_queue.put(self.basic_url + detail_path)
            finally:
                # Always decrement the queue counter, otherwise join() in
                # run() would block forever after a failure.
                self.url_queue.task_done()

    def get_images(self):
        """Worker loop: turn detail pages into ``(image path, title)`` tuples.

        Takes detail-page URLs from ``info_url_queue`` forever and puts the
        first image match of each page on ``image_url_queue``.

        :return: never returns
        """
        image_pattern = re.compile('<img src="(.+?)" data-pic=".+?title="(.+?)"')
        while True:
            view_url = self.info_url_queue.get()
            try:
                resp = self._fetch(view_url)
                if resp is None:
                    continue
                # The page is decoded as GBK; undecodable bytes are replaced
                # instead of killing the worker.
                content = resp.content.decode("gbk", errors="replace")
                matches = image_pattern.findall(content)
                if not matches:
                    print("no image found on:", view_url)
                    continue
                self.image_url_queue.put(matches[0])
            finally:
                self.info_url_queue.task_done()

    def download(self):
        """Worker loop: fetch images and write them to ``path``.

        Takes ``(image path, title)`` tuples from ``image_url_queue`` forever
        and saves each image as ``<path>/<title>.jpg``.

        :return: never returns
        """
        while True:
            pic_url_title = self.image_url_queue.get()
            try:
                pic_url, pic_title = pic_url_title
                pic_resp = self._fetch(self.basic_url + pic_url)
                if pic_resp is None:
                    continue
                image_path = os.path.join(self.path, self._safe_filename(pic_title) + '.jpg')
                print(pic_url, pic_title)
                with open(image_path, 'wb') as f:
                    f.write(pic_resp.content)
            except OSError as exc:
                print("could not save:", pic_title, exc)
            finally:
                self.image_url_queue.task_done()

    def run(self):
        """Start the worker processes, seed the pipeline and wait for it to drain."""
        process_list = []
        # Detail-page URL extraction.
        for _ in range(2):
            process_list.append(Process(target=self.get_images_info))
        # Image URL and title extraction.
        process_list.append(Process(target=self.get_images))
        # Image download.
        for _ in range(4):
            process_list.append(Process(target=self.download))
        for pr in process_list:
            # Daemon processes are terminated when the main process exits;
            # the workers loop forever and never return on their own.
            pr.daemon = True
            pr.start()
        # Seed the first queue in this process so its counter is non-zero
        # before join() is reached; no start-up sleep is needed.
        self.get_url_list()
        # Block until each queue's unfinished-task counter is back to zero.
        for q in [self.url_queue, self.info_url_queue, self.image_url_queue, self.save_queue]:
            q.join()
if __name__ == '__main__':
    # Script entry point: make sure the target directory exists, run the
    # downloader and report the elapsed wall-clock time.
    started_at = time.time()
    images_path = "./美女"
    target_missing = not os.path.exists(images_path)
    if target_missing:
        os.makedirs(images_path)
    downloader = DownLoad_Imgs(images_path)
    downloader.run()
    elapsed = time.time() - started_at
    print("total cost:", elapsed)