Logo Search packages:      
Sourcecode: webcheck version File versions  Download package

def crawler::Site::_is_internal (   self,
  link 
) [private]

Check whether the specified url is external or internal.
This uses the urls marked with add_internal() and the regular
expressions passed with add_external_re().

Definition at line 96 of file crawler.py.

00096     def _is_internal(self, link):
        """Check whether the specified url is external or internal.
        This uses the urls marked with add_internal() and the regular
        expressions passed with add_external_re()."""
        # check if it is internal through the regexps
        for regexp in self._internal_res.values():
            if regexp.search(link.url) is not None:
                return True
        res = False
        # check that the url starts with an internal url
        if config.BASE_URLS_ONLY:
            # the url must start with one of the _internal_urls
            for i in self._internal_urls:
                res |= (i==link.url[:len(i)])
        else:
            # the netloc must match a netloc of an _internal_url
            for i in self._internal_urls:
                res |= (urlparse.urlsplit(i)[1]==link.netloc)
        # if it is not internal now, it never will be
        if not res:
            return False
        # check if it is external through the regexps
        for x in self._external_res.values():
            # if the url matches it is external and we can stop
            if x.search(link.url) is not None:
                return False
        return True

    def _get_robotparser(self, link):


Generated by  Doxygen 1.6.0   Back to index