# Batch wallpaper downloader, multiprocessing edition (美女壁纸批量下载之多进程篇)

import requests
import os
import re
import time

from multiprocessing import JoinableQueue as Queue  # 与线程不一样
from multiprocessing import Process


class DownLoad_Imgs():
    """Download wallpapers from pic.netbian.com with a multi-process pipeline.

    Work flows through joinable queues, each drained by its own worker
    processes::

        url_queue (listing pages)
          -> info_url_queue (picture detail pages)
          -> image_url_queue ((image path, title) pairs)
          -> image files written under ``path``

    Every item taken from a queue is acknowledged with ``task_done()`` exactly
    once, whether it succeeded or not, so that ``run()`` can rely on
    ``join()`` returning.
    """

    # Links to picture detail pages found on a listing page.
    _DETAIL_LINK_RE = re.compile('<a href="(/tupian.+?)" target="_blank">', re.M)
    # (image path, title) of the picture shown on a detail page.
    _IMAGE_RE = re.compile('<img src="(.+?)" data-pic=".+?title="(.+?)"')
    # Path separators that must never end up inside a file name.
    _UNSAFE_NAME_RE = re.compile(r'[\\/]')

    def __init__(self, path, pages=20, timeout=10, max_retries=3):
        """
        :param path: existing directory the images are saved into
        :param pages: number of listing pages to crawl, starting at page 1
        :param timeout: per-request timeout in seconds
        :param max_retries: attempts per URL before the item is skipped
        """
        # Site root, prepended to the relative links found in the pages
        self.basic_url = 'http://pic.netbian.com'
        # First listing page of the category to download
        self.html_url = "http://pic.netbian.com/4kmeinv/"
        # Request headers
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
        }
        # Directory the images are saved into
        self.path = path
        self.pages = pages
        self.timeout = timeout
        self.max_retries = max_retries

        self.url_queue = Queue()  # listing page URLs
        self.info_url_queue = Queue()  # picture detail page URLs
        self.image_url_queue = Queue()  # (image path, title) pairs
        self.save_queue = Queue()  # not fed by any method of this class; still joined by run()

    @staticmethod
    def _safe_filename(title):
        """Return ``title`` as text with path separators replaced by ``_``."""
        return DownLoad_Imgs._UNSAFE_NAME_RE.sub('_', str(title))

    def _fetch(self, url):
        """GET ``url`` and return the response once the status is 200.

        :raises RuntimeError: when ``max_retries`` attempts all failed
        """
        last_error = None
        for _ in range(self.max_retries):
            try:
                # headers must be passed by keyword: the second positional
                # argument of requests.get() is ``params``.
                resp = requests.get(url, headers=self.header, timeout=self.timeout)
            except requests.RequestException as exc:
                last_error = exc
                continue
            if resp.status_code == 200:
                return resp
            last_error = "HTTP %d" % resp.status_code
        raise RuntimeError("giving up on %s after %d attempts (%s)"
                           % (url, self.max_retries, last_error))

    def _consume(self, work_queue, handle):
        """Forever take items from ``work_queue`` and pass them to ``handle``.

        A failing item is reported and skipped instead of killing the worker,
        and ``task_done()`` is always called so the queue can be joined.
        """
        while True:
            item = work_queue.get()
            try:
                handle(item)
            except Exception as exc:  # worker boundary: report and keep going
                print("skip %r: %r" % (item, exc))
            finally:
                work_queue.task_done()

    def _parse_listing(self, url):
        """Queue the detail page URL of every picture linked from ``url``."""
        resp = self._fetch(url)
        for href in self._DETAIL_LINK_RE.findall(resp.text):
            self.info_url_queue.put(self.basic_url + href)

    def _parse_detail(self, view_url):
        """Queue the (image path, title) pair found on ``view_url``."""
        resp = self._fetch(view_url)
        content = resp.content.decode("gbk", errors="replace")
        match = self._IMAGE_RE.search(content)
        if match is None:
            raise ValueError("no image found on %s" % view_url)
        self.image_url_queue.put(match.groups())

    def _save_image(self, pic_url_title):
        """Download one image and write it to ``<path>/<title>.jpg``."""
        pic_url, pic_title = pic_url_title
        pic_resp = self._fetch(self.basic_url + pic_url)
        # The title comes from the scraped page, so strip path separators
        # before using it as a file name.
        image_path = os.path.join(self.path, self._safe_filename(pic_title) + '.jpg')
        print(pic_url, pic_title)
        with open(image_path, 'wb') as f:
            f.write(pic_resp.content)

    def get_url_list(self):
        """
        Put the URL of each listing page (1..pages) on ``url_queue``.
        :return:
        """
        for i in range(1, self.pages + 1):
            # Page 1 is the bare category URL; later pages are index_<n>.html
            if i == 1:
                url = self.html_url
            else:
                url = self.html_url + "index_" + str(i) + ".html"
            self.url_queue.put(url)

    def get_images_info(self):
        """
        Worker loop: turn listing page URLs from ``url_queue`` into detail
        page URLs on ``info_url_queue``. Never returns.
        """
        self._consume(self.url_queue, self._parse_listing)

    def get_images(self):
        """
        Worker loop: turn detail page URLs from ``info_url_queue`` into
        (image path, title) pairs on ``image_url_queue``. Never returns.
        """
        self._consume(self.info_url_queue, self._parse_detail)

    def download(self):
        """
        Worker loop: download every (image path, title) pair taken from
        ``image_url_queue`` and save it under ``path``. Never returns.
        """
        self._consume(self.image_url_queue, self._save_image)

    def run(self):
        """Start the worker processes and block until every queue is drained."""
        # Two listing parsers, one detail parser, four downloaders
        process_list = [Process(target=self.get_images_info) for _ in range(2)]
        process_list.append(Process(target=self.get_images))
        process_list.extend(Process(target=self.download) for _ in range(4))

        for pr in process_list:
            pr.daemon = True  # daemon processes are terminated when the main process exits
            pr.start()

        # Seed the first queue from the main process: put() has counted every
        # URL before join() is reached, so no sleep is needed to avoid joining
        # a queue that is still empty.
        self.get_url_list()

        # Join in pipeline order: a stage only calls task_done() after it has
        # queued its output for the next stage.
        for q in (self.url_queue, self.info_url_queue, self.image_url_queue, self.save_queue):
            q.join()  # block until the queue's unfinished-task count reaches 0


if __name__ == '__main__':
    # Script entry point: download into ./美女 and report the elapsed time.
    t = time.time()
    images_path = "./美女"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(images_path, exist_ok=True)
    dl = DownLoad_Imgs(images_path)
    dl.run()
    print("total cost:", time.time() - t)