
def crawler::Site::_get_robotparser(self, link) [private]

Return the proper robots parser for the given url or None if one
cannot be constructed. Robot parsers are cached per scheme and
netloc.

Definition at line 124 of file crawler.py.

    def _get_robotparser(self, link):
        """Return the proper robots parser for the given url or None if one
        cannot be constructed. Robot parsers are cached per scheme and
        netloc."""
        # only some schemes have a meaningful robots.txt file
        if link.scheme != 'http' and link.scheme != 'https':
            debugio.debug('crawler._get_robotparser() called with unsupported scheme (%s)' % link.scheme)
            return None
        # split out the key part of the url
        location = urlparse.urlunsplit((link.scheme, link.netloc, '', '', ''))
        # try to create a new robotparser if we don't already have one
        if not self._robotparsers.has_key(location):
            import httplib
            debugio.info('  getting robots.txt for %s' % location)
            self._robotparsers[location] = None
            try:
                rp = robotparser.RobotFileParser()
                rp.set_url(urlparse.urlunsplit(
                  (link.scheme, link.netloc, '/robots.txt', '', '') ))
                rp.read()
                self._robotparsers[location] = rp
            except (TypeError, IOError, httplib.HTTPException):
                # ignore any problems setting up robot parser
                pass
        return self._robotparsers[location]
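
A caller would typically use the returned parser's can_fetch() method to
decide whether a URL may be retrieved. The snippet below is a minimal
sketch of that pattern, not the actual webcheck crawl loop: the names
site and fetch() are assumed for illustration, while link.url and the
debugio logging follow the conventions used in the listing above.

    # sketch only: `site` and `fetch()` are assumed names, not webcheck API
    rp = site._get_robotparser(link)
    if rp is not None and not rp.can_fetch('webcheck', link.url):
        debugio.info('  %s is disallowed by robots.txt' % link.url)
    else:
        # either the URL is allowed or no robots.txt could be retrieved
        fetch(link)  # hypothetical fetch step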
