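# A concurrent web spider built on tornado.queues: starting from `base_url`,
# a fixed pool of worker coroutines pulls URLs from a shared queue, fetches
# each page, and enqueues any links it finds under the same base URL.
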
import time
from datetime import timedelta

from html.parser import HTMLParser
from urllib.parse import urljoin, urldefrag
from tornado import gen, httpclient, ioloop, queues

base_url = 'http://www.tornadoweb.org/en/stable/'
concurrency = 10


async def get_links_from_url(url):
    """Download the page at `url` and parse it for links.

    Returned links have had the fragment after `#` removed and have been made
    absolute, so e.g. the URL 'gen.html#tornado.gen.coroutine' becomes
    'http://www.tornadoweb.org/en/stable/gen.html'.
    """
    response = await httpclient.AsyncHTTPClient().fetch(url)
    print('fetched %s' % url)

    html = response.body.decode(errors='ignore')
    return [urljoin(url, remove_fragment(new_url))
            for new_url in get_links(html)]


def remove_fragment(url):
    pure_url, frag = urldefrag(url)
    return pure_url

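# For example, echoing the docstring above:
#   remove_fragment('gen.html#tornado.gen.coroutine')  # -> 'gen.html'
#   urljoin(base_url, 'gen.html')
#   # -> 'http://www.tornadoweb.org/en/stable/gen.html'
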

def get_links(html):
    # Collect the href of every <a> tag seen while parsing the page.
    class URLSeeker(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self.urls = []

        def handle_starttag(self, tag, attrs):
            href = dict(attrs).get('href')
            if href and tag == 'a':
                self.urls.append(href)

    url_seeker = URLSeeker()
    url_seeker.feed(html)
    return url_seeker.urls

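# A quick illustration of the parser's behaviour:
#   get_links('<a href="gen.html#tornado.gen.coroutine">gen</a>')
#   # -> ['gen.html#tornado.gen.coroutine']
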

async def main():
    q = queues.Queue()
    start = time.time()
    fetching, fetched = set(), set()

    async def fetch_url(current_url):
        # Skip URLs that are already being fetched (or have been).
        if current_url in fetching:
            return

        print('fetching %s' % current_url)
        fetching.add(current_url)
        urls = await get_links_from_url(current_url)
        fetched.add(current_url)

        for new_url in urls:
            # Only follow links beneath the base URL.
            if new_url.startswith(base_url):
                await q.put(new_url)

    async def worker():
        async for url in q:
            if url is None:
                return
            try:
                await fetch_url(url)
            except Exception as e:
                print('Exception: %s %s' % (e, url))
            finally:
                q.task_done()

    await q.put(base_url)

    # Start the workers, then wait for the work queue to drain.
    workers = gen.multi([worker() for _ in range(concurrency)])
    await q.join(timeout=timedelta(seconds=300))
    assert fetching == fetched
    print('Done in %d seconds, fetched %s URLs.' % (
        time.time() - start, len(fetched)))

    # Signal all the workers to exit with a `None` sentinel each.
    for _ in range(concurrency):
        await q.put(None)
    await workers


if __name__ == '__main__':
    io_loop = ioloop.IOLoop.current()
    io_loop.run_sync(main)
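
# To run the spider (assuming this file is saved as, e.g., webspider.py):
#   $ python webspider.py
# Each page is printed as it is fetched, followed by a summary once the queue
# drains; `q.join` raises a timeout error if the crawl exceeds 300 seconds.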