我不会接受我不要的未来
哪怕是命中注定

普通爬虫,多进程,协程,异步结合多进程-爬虫程序

测试普通爬虫程序

import time
import requests
from lxml import etree

# Consecutive chapter pages of one novel on shuquge.com; crawled sequentially below.
urls = [
    'http://www.shuquge.com/txt/76615/14606964.html',
    'http://www.shuquge.com/txt/76615/14606965.html',
    'http://www.shuquge.com/txt/76615/14606966.html',
    'http://www.shuquge.com/txt/76615/14606967.html',
    'http://www.shuquge.com/txt/76615/14606968.html',
    'http://www.shuquge.com/txt/76615/14606969.html',
    'http://www.shuquge.com/txt/76615/14606970.html',
    'http://www.shuquge.com/txt/76615/14606971.html'
]

def get_title(url, cnt):
    """Fetch one chapter page synchronously and print its title.

    :param url: chapter page URL to fetch
    :param cnt: 1-based ordinal, used only in the printed message
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    """
    # A timeout keeps a stalled server from hanging the whole crawl forever;
    # the original call had none.
    response = requests.get(url, timeout=10)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    html = response.content
    # The chapter title lives in <div class="content"><h1>…</h1>.
    title = etree.HTML(html).xpath('//div[@class="content"]/h1/text()')
    print('第%d个title:%s' % (cnt, ''.join(title)))

if __name__ == '__main__':
    # Baseline: fetch every page strictly sequentially, timing each request
    # and the total; the total is roughly the sum of the per-page times.
    start1 = time.time()
    # enumerate(urls, 1) replaces the hand-maintained `i = i + 1` counter.
    for i, url in enumerate(urls, 1):
        start = time.time()
        get_title(url, i)
        print('第%d个title爬取耗时:%.5f秒' % (i, time.time() - start))
    print('爬取总耗时:%.5f秒' % (time.time() - start1))

时间大概是4秒多

测试基于协程的异步爬虫程序

import time
import aiohttp
import asyncio
from lxml import etree

# Consecutive chapter pages of one novel on shuquge.com; crawled concurrently below.
urls = [
    'http://www.shuquge.com/txt/76615/14606964.html',
    'http://www.shuquge.com/txt/76615/14606965.html',
    'http://www.shuquge.com/txt/76615/14606966.html',
    'http://www.shuquge.com/txt/76615/14606967.html',
    'http://www.shuquge.com/txt/76615/14606968.html',
    'http://www.shuquge.com/txt/76615/14606969.html',
    'http://www.shuquge.com/txt/76615/14606970.html',
    'http://www.shuquge.com/txt/76615/14606971.html',
    'http://www.shuquge.com/txt/76615/14606972.html',
    'http://www.shuquge.com/txt/76615/14606973.html',
    'http://www.shuquge.com/txt/76615/14606974.html',
    'http://www.shuquge.com/txt/76615/14606975.html'
]

# NOTE(review): `tilte` looks like a typo for `title` and is never read or
# written anywhere else in this script — presumably dead code; confirm before removing.
tilte = []
# Cap the number of in-flight fetches at 10.
sem = asyncio.Semaphore(10)


async def get_title(url):
    """Asynchronously fetch one chapter page and print its title.

    Concurrency is bounded by the module-level semaphore `sem`.

    :param url: chapter page URL to fetch
    """
    # `with (await sem)` was deprecated in Python 3.8 and removed in 3.10;
    # `async with` is the supported way to hold an asyncio.Semaphore.
    async with sem:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                html = await resp.read()
                # Same XPath as the synchronous variant above.
                title = etree.HTML(html).xpath('//div[@class="content"]/h1/text()')
                print(''.join(title))


def main():
    """Run every page fetch concurrently and block until all have finished."""
    async def _crawl():
        # asyncio.wait() no longer accepts bare coroutines (removed in 3.11);
        # gather() schedules them and propagates any exception.
        await asyncio.gather(*(get_title(url) for url in urls))

    # asyncio.run() creates, runs and closes the event loop; the old
    # get_event_loop()/run_until_complete()/close() dance is deprecated.
    # NOTE(review): on Python < 3.10 the module-level Semaphore may bind to a
    # different loop than the one asyncio.run() creates — verify target version.
    asyncio.run(_crawl())


if __name__ == '__main__':
    # Time the whole asynchronous crawl end to end.
    t0 = time.time()
    main()  # 调用方
    elapsed = time.time() - t0
    print('总耗时:%.5f秒' % float(elapsed))

测试结果是1秒不到

测试基于多进程的分布式爬虫程序

import multiprocessing
from multiprocessing import Pool
import time
import requests
from lxml import etree

# Consecutive chapter pages of one novel on shuquge.com; split across worker processes below.
urls = [
    'http://www.shuquge.com/txt/76615/14606964.html',
    'http://www.shuquge.com/txt/76615/14606965.html',
    'http://www.shuquge.com/txt/76615/14606966.html',
    'http://www.shuquge.com/txt/76615/14606967.html',
    'http://www.shuquge.com/txt/76615/14606968.html',
    'http://www.shuquge.com/txt/76615/14606969.html',
    'http://www.shuquge.com/txt/76615/14606970.html',
    'http://www.shuquge.com/txt/76615/14606971.html',
    'http://www.shuquge.com/txt/76615/14606972.html',
    'http://www.shuquge.com/txt/76615/14606973.html',
    'http://www.shuquge.com/txt/76615/14606974.html',
    'http://www.shuquge.com/txt/76615/14606975.html'
]

def get_title(url, cnt):
    """Fetch one chapter page synchronously and print its title.

    Runs inside a worker process when dispatched through the Pool below.

    :param url: chapter page URL to fetch
    :param cnt: 1-based ordinal, used only in the printed message
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    """
    # A timeout keeps a stalled server from hanging a worker process forever;
    # the original call had none.
    response = requests.get(url, timeout=10)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    html = response.content
    # The chapter title lives in <div class="content"><h1>…</h1>.
    title = etree.HTML(html).xpath('//div[@class="content"]/h1/text()')
    print('第%d个title:%s' % (cnt, ''.join(title)))

def main():
    """Fan the page fetches out over a pool of 4 worker processes."""
    print('当前环境CPU核数是:%d核' % multiprocessing.cpu_count())
    p = Pool(4)  # process pool
    # Keep the AsyncResult handles: calling .get() re-raises any exception a
    # worker hit, which apply_async otherwise swallows silently.
    results = [
        p.apply_async(get_title, args=(url, i))
        for i, url in enumerate(urls, 1)
    ]
    p.close()
    for r in results:
        r.get()
    p.join()  # all child processes are done before anything after this runs

if __name__ == '__main__':
    # Time the whole multi-process crawl end to end.
    t0 = time.time()
    main()  # 调用方
    elapsed = time.time() - t0
    print('总耗时:%.5f秒' % float(elapsed))

 

赞(0)
未经允许不得转载:技术搬运工 » 普通爬虫,多进程,协程,异步结合多进程-爬虫程序
分享到: 更多 (0)

评论 抢沙发

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址

我们不生产技术 我们只是技术的搬运工