
def crawler::Site::crawl(self, serfp=None)

Crawl the website based on the urls specified with
add_internal(). If the serialization file pointer
is specified the crawler writes out updated links to
the file while crawling the site.

Definition at line 191 of file crawler.py.
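
A minimal usage sketch (not taken from the webcheck sources): it assumes the crawler module is importable, that Site() can be constructed without arguments, and that add_internal() accepts a single URL string; the output file name is hypothetical.

    import crawler

    site = crawler.Site()
    site.add_internal('http://www.example.com/')

    # serfp is optional; when given, updated links are written to it while crawling
    with open('webcheck.dat', 'w') as serfp:
        site.crawl(serfp)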

00191     def crawl(self, serfp=None):
        """Crawl the website based on the urls specified with
        add_internal(). If the serialization file pointer
        is specified the crawler writes out updated links to
        the file while crawling the site."""
        # TODO: have some different scheme to crawl a site (e.g. separate
        #       internal and external queues, threading, etc)
        tocheck = set()
        # add all unfetched site urls
        for link in self.linkMap.values():
            if not link.isyanked and not link.isfetched:
                tocheck.add(link)
        # add all internal urls
        for url in self._internal_urls:
            tocheck.add(self.get_link(url))
        # repeat until we have nothing more to check
        fetchedlinks = 0
        while len(tocheck) > 0:
            debugio.debug('crawler.crawl(): items left to check: %d' % len(tocheck))
            # choose a link from the tocheck list
            link = tocheck.pop()
            # skip link if there is nothing to check
            if link.isyanked or link.isfetched:
                continue
            # fetch the link's contents
            link.fetch()
            # add children to tocheck
            for child in link.children:
                if not child.isyanked and not child.isfetched:
                    tocheck.add(child)
            # add embedded content
            for embed in link.embedded:
                if not embed.isyanked and not embed.isfetched:
                    tocheck.add(embed)
            # serialize all as of yet unserialized links
            fetchedlinks += 1
            # TODO: make this configurable
            if serfp and fetchedlinks >= 5:
                fetchedlinks = 0
                import serialize
                for link in self.linkMap.values():
                    if link._ischanged:
                        serialize.serialize_link(serfp, link)
                        link._ischanged = False
                serfp.flush()
            # sleep between requests if configured
            if config.WAIT_BETWEEN_REQUESTS > 0:
                debugio.debug('crawler.crawl(): sleeping %s seconds' % config.WAIT_BETWEEN_REQUESTS)
                time.sleep(config.WAIT_BETWEEN_REQUESTS)
        # serialize remaining changed links
        if serfp:
            import serialize
            for link in self.linkMap.values():
                if link._ischanged:
                    serialize.serialize_link(serfp, link)
                    link._ischanged = False
            serfp.flush()
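
The batch size of 5 fetched links between serialization passes is hard-coded and flagged with a TODO above. A small sketch of one way it could be made configurable, following the pattern already used for config.WAIT_BETWEEN_REQUESTS; SERIALIZE_INTERVAL is a hypothetical option, not an existing webcheck setting.

    import config

    # hypothetical option, not present in webcheck's configuration;
    # falls back to the hard-coded batch size of 5 used in crawl() above
    SERIALIZE_INTERVAL = getattr(config, 'SERIALIZE_INTERVAL', 5)

The condition in crawl() would then read "if serfp and fetchedlinks >= SERIALIZE_INTERVAL:" instead of comparing against the literal 5.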
