# 4.1  加速爬虫: 多进程分布式

![](https://morvanzhou.github.io/static/results/scraping/4-1-1.png)

首页中有很多 URL。使用多进程同时下载这些 URL, 得到它们的 HTML 以后, 再同时解析网页内容, 在网页中寻找这个网站还没有爬过的链接, 最终爬完整个莫烦 Python 网站的所有页面。(为了控制演示时间, 下面的示例代码在已爬取的链接超过 20 个后就会停止。)

```python
import multiprocessing as mp
import time
from urllib.request import urlopen, urljoin
from bs4 import BeautifulSoup
import re


# Root of the site to crawl: the first URL visited and the base that
# relative links found on each page are joined against.
base_url = 'https://morvanzhou.github.io/'
```

```python
def crawl(url):
    """Download ``url`` and return its body as text.

    The response is used as a context manager so it is closed as soon as
    the body has been read, instead of staying open until garbage
    collection; a crawl over many pages therefore does not pile up open
    connections.

    Args:
        url: Address to fetch; handed unchanged to ``urlopen``.

    Returns:
        The response body decoded with the ``bytes.decode()`` default
        codec (UTF-8).
    """
    with urlopen(url) as response:
        time.sleep(0.1)             # slightly delay for downloading
        return response.read().decode()


def parse(html):
    """Extract the title, the internal links and the page's own URL.

    Args:
        html: HTML source of a single page, as text.

    Returns:
        A ``(title, page_urls, url)`` tuple where

        * ``title`` is the stripped text of the first ``<h1>``, or ``''``
          when the page has no ``<h1>``;
        * ``page_urls`` is the set of absolute URLs obtained by joining
          ``base_url`` with every ``<a href>`` that matches ``^/.+?/$``;
        * ``url`` is the ``content`` attribute of the
          ``<meta property="og:url">`` tag, or ``None`` when the tag or
          the attribute is missing.
    """
    soup = BeautifulSoup(html, 'lxml')

    # Keep only links whose href starts and ends with "/".
    anchors = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    page_urls = {urljoin(base_url, anchor['href']) for anchor in anchors}

    # find() yields None when nothing matches; guard both lookups so one
    # page without the tag cannot abort the whole crawl with an exception.
    heading = soup.find('h1')
    title = heading.get_text().strip() if heading is not None else ''

    og_url = soup.find('meta', {'property': "og:url"})
    url = og_url.get('content') if og_url is not None else None
    return title, page_urls, url
```

```python
# Single-process baseline: every page is fetched and parsed one after
# another in this process.
unseen = {base_url}         # URLs waiting to be fetched
seen = set()                # URLs already fetched

count, t1 = 1, time.time()

# One round per iteration; stop when nothing is left to visit or once
# more than 20 URLs have been fetched.
while unseen and len(seen) <= 20:
    print('\nDistributed Crawling...')
    htmls = list(map(crawl, unseen))

    print('\nDistributed Parsing...')
    results = list(map(parse, htmls))

    print('\nAnalysing...')
    seen |= unseen              # everything in this round is now fetched
    unseen.clear()              # start the next round from an empty set

    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen |= page_urls - seen      # queue only links not fetched yet
print('Total time: %.1f s' % (time.time()-t1, ))    # 53 s
```

```python
# Multiprocessing version of the crawl: downloads and parsing are handed
# to a pool of 4 worker processes. The __main__ guard keeps the crawl from
# being re-run if worker processes import this module.
if __name__ == '__main__':
    unseen = {base_url}         # URLs waiting to be fetched
    seen = set()                # URLs already fetched

    # Use the pool as a context manager so the worker processes are always
    # released, even when a crawl or parse job raises.
    with mp.Pool(4) as pool:
        count, t1 = 1, time.time()
        while unseen:                           # still get some url to visit
            if len(seen) > 20:
                break
            print('\nDistributed Crawling...')
            crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
            htmls = [j.get() for j in crawl_jobs]       # request connection

            print('\nDistributed Parsing...')
            parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
            results = [j.get() for j in parse_jobs]     # parse html

            print('\nAnalysing...')
            seen.update(unseen)         # seen the crawled
            unseen.clear()              # nothing unseen

            for title, page_urls, url in results:
                print(count, title, url)
                count += 1
                unseen.update(page_urls - seen)     # get new url to crawl
        # Report before the pool is torn down so the measured span matches
        # the crawl itself.
        print('Total time: %.1f s' % (time.time()-t1, ))    # 16 s !!!
```


---

# Agent Instructions: Querying This Documentation

If you need additional information that is not directly available in this page, you can query the documentation dynamically by asking a question.

Perform an HTTP GET request on the current page URL with the `ask` query parameter:

```
GET https://baozoulin.gitbook.io/python/jia-su-ni-de-pa-chong/41-jia-su-pa-866b3a-duo-jin-cheng-fen-bu-shi.md?ask=<question>
```

The question should be specific, self-contained, and written in natural language.
The response will contain a direct answer to the question and relevant excerpts and sources from the documentation.

Use this mechanism when the answer is not explicitly present in the current page, you need clarification or additional context, or you want to retrieve related documentation sections.
