
Python thread pool problem (waits for something)

I wrote a simple web site crawler using a thread pool. The problem is: when the crawler has visited the whole site it should finish, but in reality it waits for something at the end and the script never terminates. Why does this happen?

from Queue import Queue
from threading import Thread

from urllib import urlopen
from BeautifulSoup import BeautifulSoup, SoupStrainer

visited = set()
queue = Queue()

class Worker(Thread):
    """Thread executing tasks from a given tasks queue"""
    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
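        # daemon threads are killed automatically when the main thread exits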
        self.daemon = True
        self.start()

    def run(self):
        while True:
            # Queue.get() blocks until a task is available
            func, args, kargs = self.tasks.get()
            print "startcall in thread", self
            print args
            try:
                func(*args, **kargs)
            except Exception, e:
                print e
            print "stopcall in thread", self
            self.tasks.task_done()

class ThreadPool:
    """Pool of threads consuming tasks from a queue"""
    def __init__(self, num_threads):
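        # Queue(num_threads) creates a *bounded* queue: put() blocks once
        # num_threads items are waiting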
        self.tasks = Queue(num_threads)
        for _ in range(num_threads): Worker(self.tasks)

    def add_task(self, func, *args, **kargs):
        """Add a task to the queue"""
        self.tasks.put((func, args, kargs))

    def wait_completion(self):
        """Wait for completion of all the tasks in the queue"""
        self.tasks.join()


def process(pool, host, url):
    try:
        print "get url", url
        #content = urlopen(url).read().decode(charset)
        content = urlopen(url).read()
    except UnicodeDecodeError:
        return

    for link in BeautifulSoup(content, parseOnlyThese=SoupStrainer('a')):
        #print "link",link
        try:
            href = link['href']
        except KeyError:
            continue


        if not href.startswith('http://'):
            href = 'http://%s%s' % (host, href)
        if not href.startswith('http://%s%s' % (host, '/')):
            continue

        if href not in visited:
            visited.add(href)
            pool.add_task(process, pool, host, href)
            print href

def start(host, charset):
    pool = ThreadPool(7)
    pool.add_task(process, pool, host, 'http://%s/' % (host))
    pool.wait_completion()

start('simplesite.com', 'utf8')


The problem I see is that you never exit the while loop in run, so the worker threads block forever. You need to break out of that loop when all the jobs are done.

You could try to:

1) insert

if not func: break

after tasks.get(...) in run;

2) append

pool.add_task(None, None, None)

at the end of process.

This is a way for process to notify the pool that it has no more tasks to process. A sketch combining both changes follows below.
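For illustration, here is a minimal sketch of how the two suggestions could fit together. Two caveats: the shutdown() helper is hypothetical and not part of the original code, and I am assuming you want one sentinel per worker, since with seven workers a single (None, None, None) only stops one thread. The worker also still calls task_done() for the sentinel; otherwise Queue.join() would never return.

from Queue import Queue
from threading import Thread

class Worker(Thread):
    """Thread executing tasks until it receives a sentinel"""
    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        while True:
            func, args, kargs = self.tasks.get()
            if not func:
                # sentinel: mark it done so join() can return, then exit
                self.tasks.task_done()
                break
            try:
                func(*args, **kargs)
            except Exception, e:
                print e
            self.tasks.task_done()

class ThreadPool:
    """Pool of threads consuming tasks from a queue"""
    def __init__(self, num_threads):
        self.num_threads = num_threads
        self.tasks = Queue(num_threads)
        for _ in range(num_threads):
            Worker(self.tasks)

    def add_task(self, func, *args, **kargs):
        """Add a task to the queue"""
        self.tasks.put((func, args, kargs))

    def shutdown(self):
        """Hypothetical helper: one sentinel per worker, then wait"""
        for _ in range(self.num_threads):
            self.tasks.put((None, None, None))
        self.tasks.join()

With this variant, the crawler would call pool.shutdown() once it knows no more pages will be queued, and every worker would leave its loop cleanly.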
