pyppeteer is a handy crawler library.
Its advantages: it drives Chromium in the background and fetches asynchronously, and it is simple to use. For pages whose data is loaded asynchronously, it is very convenient.
```python
# A simple example
# coding=utf-8
import asyncio

from pyppeteer import launch
from pyquery import PyQuery as pq

# Listing pages of one Huya user's videos; the video list on these pages is
# loaded asynchronously, so a plain HTTP request would not see it.
urls = ["https://v.huya.com/u/146501201/video.html?sort=news&p={}".format(i) for i in range(1, 6)]


async def main():
    browser = await launch()        # launch a headless Chromium in the background
    page = await browser.newPage()
    with open("/Users/ming/projects/huyaPyDwon/down.txt", 'a') as f:
        for url in urls:
            await page.goto(url)
            # wait until the async-loaded video list has actually rendered
            await page.waitForSelector('.content-list .statpid')
            doc = pq(await page.content())   # parse the rendered HTML with pyquery
            pink_link = "https://v.huya.com"
            names = [pink_link + item.attr('href') for item in doc('.content-list .statpid').items()]
            for name in names:
                f.write(name + '\n')
            # print('Names:', names)
    await browser.close()


asyncio.get_event_loop().run_until_complete(main())
```
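The loop above reuses a single tab, so the five pages are still visited one after another. Since pyppeteer sits on top of asyncio, each URL can also get its own tab and be fetched concurrently with `asyncio.gather`. Below is a minimal sketch of that idea, assuming the same `urls` list and selector as in the example above; the `fetch_links` helper is a made-up name for illustration, but the pyppeteer calls (`newPage`, `goto`, `waitForSelector`, `content`, `close`) are the same ones used above.

```python
import asyncio

from pyppeteer import launch
from pyquery import PyQuery as pq

urls = ["https://v.huya.com/u/146501201/video.html?sort=news&p={}".format(i) for i in range(1, 6)]


async def fetch_links(browser, url):
    # hypothetical helper: open a dedicated tab, wait for the async-loaded
    # list to render, then collect the video links from the page
    page = await browser.newPage()
    await page.goto(url)
    await page.waitForSelector('.content-list .statpid')
    doc = pq(await page.content())
    await page.close()
    return ["https://v.huya.com" + item.attr('href')
            for item in doc('.content-list .statpid').items()]


async def main():
    browser = await launch()
    # run all page fetches concurrently inside the same browser
    results = await asyncio.gather(*(fetch_links(browser, url) for url in urls))
    await browser.close()
    for names in results:
        for name in names:
            print(name)


asyncio.get_event_loop().run_until_complete(main())
```

Whether this is actually faster depends on how many tabs your machine and the target site tolerate; for a handful of pages the sequential version above is usually good enough.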