import asyncio
import multiprocessing as mp
import os
import re
import time

import aiohttp
import requests
from aiosocksy.connector import ProxyConnector, ProxyClientRequest
from fake_useragent import UserAgent
from lxml import etree
from retrying import retry
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class Config:
    def __init__(self):
        self.SCR_NUM_MAX = 3


class D_crawler:
    def __init__(self, author):
        self.author = author
        self.base_url = "https://www.deviantart.com/{}/gallery/all".format(self.author)
        self.img_info = []
        self.config = Config()
        self.mkdir()

    def mkdir(self):
        filelist = os.listdir('.')
        if self.author not in filelist:
            # Verify that the author's gallery actually exists before creating a folder
            r = requests.get(self.base_url)
            if r.status_code != 200:
                exit("The author you entered does not exist")
            os.mkdir(self.author)
            print("Created a folder for author {}".format(self.author))
        else:
            if len(os.listdir(self.author)):
                exit("The folder already exists and contains files; stopping")
            else:
                print("The folder already exists but is empty; continuing")

    def init_browser(self):
        print("Initializing the browser...")
        chrome_options = Options()
        chrome_options.add_argument('--proxy-server=socks5://127.0.0.1:1081')
        # Disable image loading so gallery pages render faster
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        Browser = webdriver.Chrome(options=chrome_options)
        Browser.get(self.base_url)
        WebDriverWait(Browser, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'body'))
        )
        Browser.find_element_by_css_selector('body').click()
        print("Browser initialized")
        return Browser

    def get_xpath(self, xpath_name):
        xpath = {
            'a_hrefs_xpath': '//*[@class="_2vta_"]',
            'img_src_xpath': '//*[@id="root"]/main/div/div[1]/div[1]/div/div[2]/div[1]/img/@src',
            'logging_xpath': '//*[@class="_3UKUX"]',
            'all_num': '//*[@id="sub-folder-gallery"]/div[1]/div/div/div/div[1]/div/div/div/div/span',
            'tail': '//*[@class="_1BvgX"]'
        }
        return xpath[xpath_name]

    def check_last(self, Browser):
        # Keep scrolling while the "end of gallery" marker is not yet present
        try:
            Browser.find_element_by_xpath(self.get_xpath('tail'))
            return False
        except NoSuchElementException:
            return True

    async def main_process(self):
        Browser = self.init_browser()
        await asyncio.sleep(0.1)
        all_num = str(Browser.find_element_by_xpath(self.get_xpath('all_num')).text)
        print("{} has {} images".format(self.author, all_num))
        scr_num = 0
        img_info = []
        connector = ProxyConnector()
        all_hrefs = set()
        unseen = set()
        seen = set()
        async with aiohttp.ClientSession(connector=connector, request_class=ProxyClientRequest) as session:
            while self.check_last(Browser):
                # Collect every artwork link currently rendered on the page
                all_hrefs.update([a_tag.get_attribute('href')
                                  for a_tag in Browser.find_elements_by_xpath(self.get_xpath('a_hrefs_xpath'))])
                unseen.update(all_hrefs - seen)
                if len(unseen) != 0:
                    # ensure_future avoids relying on the module-level `loop` global
                    tasks = [asyncio.ensure_future(self.get_img_info(session, target)) for target in unseen]
                    done, pending = await asyncio.wait(tasks)
                    for item in done:
                        if item.result():
                            img_info.append(item.result())
                    print("Collected info for {} images so far".format(len(img_info)))
                try:
                    # Wait for the loading indicator to disappear before scrolling on
                    WebDriverWait(Browser, 10).until(
                        EC.invisibility_of_element_located((By.XPATH, self.get_xpath('logging_xpath')))
                    )
                except TimeoutException:
                    Browser.find_element_by_xpath(self.get_xpath('a_hrefs_xpath')).send_keys(Keys.PAGE_UP)
                    Browser.find_element_by_xpath(self.get_xpath('a_hrefs_xpath')).send_keys(Keys.PAGE_UP)
                await asyncio.sleep(0.1)
                seen.update(unseen)
                unseen.clear()
                Browser.find_element_by_xpath(self.get_xpath('a_hrefs_xpath')).send_keys(Keys.PAGE_DOWN)
        Browser.quit()
        return img_info

    # NOTE: retrying's @retry does not await coroutines, so it cannot re-run a
    # failed request here; see the retry_async sketch at the end of this listing.
    @retry
    async def get_img_info(self, session, url):
        img_name = "".join(re.findall(r'(?<=\/)[^\/]*(?=\-)', url)) + ".jpg"
        socks = 'socks5://127.0.0.1:1081'
        r = await session.get(url, proxy=socks)
        html = await r.text()
        await asyncio.sleep(0.1)
        selector = etree.HTML(html)
        parse_hrefs = "".join(selector.xpath(self.get_xpath('img_src_xpath')))
        if parse_hrefs != '':
            img_info = {
                'img_name': img_name,
                'img_href': parse_hrefs,
                'a_href': url
            }
            return img_info
        else:
            print(img_name + ': image address not found')
            return False

    # Cap retries so a persistently failing download cannot loop forever
    @retry(stop_max_attempt_number=3, wait_fixed=2000)
    def download_img(self, img):
        res = requests.Session()
        # Port 1081 is a SOCKS5 proxy, so the scheme must say so
        # (requires requests[socks] / PySocks)
        proxies = {
            'http': 'socks5://127.0.0.1:1081',
            'https': 'socks5://127.0.0.1:1081',
        }
        headers = {'User-Agent': UserAgent().random}
        if len(img) != 0:
            filename = self.check_img_exists(img['img_name'])
            if filename:
                r = res.get(img['img_href'], headers=headers, proxies=proxies)
                if r.status_code == 200:
                    with open(filename, 'wb') as f:
                        f.write(r.content)
                    print('Download succeeded')
                    return filename
                else:
                    print("Download failed with status code {}".format(r.status_code))
                    return None

    def check_img_exists(self, filename):
        filelist = os.listdir(self.author + '/')
        if filename not in filelist:
            print('{} can be downloaded'.format(filename))
            return self.author + '/' + filename
        else:
            print('{} already exists, skipping download'.format(filename))
            return False


async def main(loop, authors):
    craws = [D_crawler(author) for author in authors]
    tasks = [loop.create_task(craw.main_process()) for craw in craws]
    await asyncio.wait(tasks)
    # asyncio.wait returns an unordered set, so read results from the original
    # task list to keep them aligned with `craws`
    img_infos = [task.result() for task in tasks]
    for craw, img_info in zip(craws, img_infos):
        # Download each author's images in a separate process pool
        with mp.Pool(8) as pool:
            download_jobs = [pool.apply_async(craw.download_img, args=(img,)) for img in img_info]
            print([job.get() for job in download_jobs])
    print("All done")


if __name__ == '__main__':
    start_time = time.time()
    authors_num = int(input("How many authors do you want to crawl?\n"))
    authors = []
    for i in range(authors_num):
        authors.append(input("Please enter the name of author #{}\n".format(i + 1)))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop, authors))
    print("Total time: {:.2f} seconds".format(time.time() - start_time))
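

# --- Optional sketch: coroutine-aware retries --------------------------------
# The retrying library's @retry decorator never awaits the coroutine it wraps,
# so for get_img_info it only guards coroutine creation and cannot re-run a
# failed request. Below is a minimal sketch of an awaitable alternative; the
# name retry_async and the attempt/delay values are illustrative assumptions,
# and it is not wired into the script above.
import functools


def retry_async(attempts=3, delay=1):
    """Retry an async function a fixed number of times with a fixed delay."""
    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            last_exc = None
            for _ in range(attempts):
                try:
                    return await func(*args, **kwargs)
                except Exception as exc:  # deliberately broad: this is only a sketch
                    last_exc = exc
                    await asyncio.sleep(delay)
            raise last_exc
        return wrapper
    return decorator


# Usage would look like:
#
#     @retry_async(attempts=3, delay=1)
#     async def get_img_info(self, session, url):
#         ...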