百行代码实现一个分布式爬虫
仅需三步
- 首先开发一个单机爬虫,需要set对url去重,List的pop与append实现任务队列
- 然后将set与List改用为Redis的set与List
- 最后使用多进程或者在多台服务器上跑起来即可
示例代码
# -*- coding: utf-8 -*-
# Author: 桑葚ICE
# Email: 152516cc@gmail.com
# Blog: iicey.github.io
# JueJin: juejin.im/user/5c64dce8e51d45013c40742c
import asyncio
import logging
import os
import aiohttp
import aioredis
from aiomultiprocess import Pool
from lxml import etree
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(os.path.split(os.path.basename(__file__))[0])
user = 'root'
password = '123546'
host = '127.0.0.1'
port = 6379
class MoreAuthor:
def __init__(self, conn, redis):
self.conn = conn
self.redis = redis
async def req_res(self, session, url: str, result_type: str = 'text'):
if result_type not in {'text', 'bytes', 'json'}:
raise TypeError(f"The result_type cannot be {result_type}. Use only: text, bytes, json!")
if await self.redis.sismember('already_crawl_urls', url):
logger.debug(f"重复的URL: {url}")
return
else:
self.redis.sadd('already_crawl_urls', url)
async with session.get(url, verify_ssl=False) as result:
if result_type == 'text':
return await result.text()
if result_type == 'bytes':
return await result.read()
if result_type == 'json':
return await result.json()
async def product_and_consumer(self, session, main_url):
if 'follow' not in main_url:
main_url += '/follow?condition=0&p=1'
main_result = await self.req_res(session, main_url)
if main_result:
logger.info(f"auth_main: {main_url}")
author_url_list = await self.parse_more_author(session, main_result)
for author_url in author_url_list:
author_url = str(author_url)
if not await self.redis.sismember('already_crawl_urls', author_url):
logger.info(f"新增作者URl:{author_url}")
await self.redis.sadd('already_crawl_urls', author_url)
await self.redis.rpush('author_url_queue', author_url)
async def parse_more_author(self, session, main_result):
tree = etree.HTML(main_result)
author_url_list = list(tree.xpath('//*[@class="author-info-title"]/a/@href'))
page_url_list = tree.xpath('//*[@id="laypage_0"]/a/@href')
for page_url in page_url_list:
if page_url[0] == '/':
page_url = ''.join(['https://www.zcool.com.cn', page_url])
page_result = await self.req_res(session, str(page_url))
if page_result:
# logger.info(f"author_page: {page_url}")
next_author_url_list = await self.parse_more_author(session, page_result)
author_url_list.extend(next_author_url_list)
return author_url_list
async def author_run(self):
start_url = 'https://morncolour.zcool.com.cn/follow?condition=0&p=1'
sleep_count = 0
async with aiohttp.ClientSession() as session:
if not await self.redis.sismember('already_crawl_urls', start_url):
await asyncio.create_task(self.product_and_consumer(session, start_url))
while 1:
if sleep_count >= 60:
break
if await self.redis.llen('author_url_queue') == 0:
sleep_count += 1
await asyncio.sleep(1)
continue
sleep_count = 0
task = asyncio.create_task(self.product_and_consumer
(session, await self.redis.lpop('author_url_queue')))
await task
async def run(num):
logger.info(f"这是第{num}个进程任务")
conn = await aioredis.create_pool(f'redis://{user}:{password}@{host}:{port}', encoding='utf-8')
redis = aioredis.Redis(pool_or_conn=conn)
m = MoreAuthor(conn, redis)
await m.author_run()
conn.close()
await conn.wait_closed()
async def main():
async with Pool() as pool:
result = await pool.map(run, range(4))
logger.info(result)
if __name__ == '__main__':
asyncio.run(main())
项目地址
github.com/iicey/zcool