Source code for pywb.utils.canonicalize

""" Standard url-canonicalzation, surt and non-surt
"""

import surt
import six.moves.urllib.parse as urlparse

from pywb.utils.wbexception import BadRequestException


#=================================================================
[docs]class UrlCanonicalizer(object): def __init__(self, surt_ordered=True): self.surt_ordered = surt_ordered def __call__(self, url): return canonicalize(url, self.surt_ordered)
#=================================================================
[docs]class UrlCanonicalizeException(BadRequestException): pass
#=================================================================
[docs]def canonicalize(url, surt_ordered=True): """ Canonicalize url and convert to surt If not in surt ordered mode, convert back to url form as surt conversion is currently part of canonicalization >>> canonicalize('http://example.com/path/file.html', surt_ordered=True) 'com,example)/path/file.html' >>> canonicalize('http://example.com/path/file.html', surt_ordered=False) 'example.com/path/file.html' >>> canonicalize('urn:some:id') 'urn:some:id' """ try: key = surt.surt(url) except Exception as e: #pragma: no cover # doesn't happen with surt from 0.3b # urn is already canonical, so just use as-is if url.startswith('urn:'): return url raise UrlCanonicalizeException('Invalid Url: ' + url) # if not surt, unsurt the surt to get canonicalized non-surt url if not surt_ordered: key = unsurt(key) return key
#=================================================================
[docs]def unsurt(surt): """ # Simple surt >>> unsurt('com,example)/') 'example.com/' # Broken surt >>> unsurt('com,example)') 'com,example)' # Long surt >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\ index.html?a=b?c=)/') 'subdomain.another.subsub.sub.domain.suffix/path/file/index.html?a=b?c=)/' """ try: index = surt.index(')/') parts = surt[0:index].split(',') parts.reverse() host = '.'.join(parts) host += surt[index + 1:] return host except ValueError: # May not be a valid surt return surt
#=================================================================
[docs]def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): """ Canonicalize a url (either with custom canonicalizer or standard canonicalizer with or without surt) Then, compute a start and end search url search range for a given match type. Support match types: * exact * prefix * host * domain (only available when for surt ordering) Examples below: # surt ranges >>> calc_search_range('http://example.com/path/file.html', 'exact') ('com,example)/path/file.html', 'com,example)/path/file.html!') >>> calc_search_range('http://example.com/path/file.html', 'prefix') ('com,example)/path/file.html', 'com,example)/path/file.htmm') # slash and ? >>> calc_search_range('http://example.com/path/', 'prefix') ('com,example)/path/', 'com,example)/path0') >>> calc_search_range('http://example.com/path?', 'prefix') ('com,example)/path?', 'com,example)/path@') >>> calc_search_range('http://example.com/path/?', 'prefix') ('com,example)/path?', 'com,example)/path@') >>> calc_search_range('http://example.com/path/file.html', 'host') ('com,example)/', 'com,example*') >>> calc_search_range('http://example.com/path/file.html', 'domain') ('com,example)/', 'com,example-') special case for tld domain range >>> calc_search_range('com', 'domain') ('com,', 'com-') # non-surt ranges >>> calc_search_range('http://example.com/path/file.html', 'exact', False) ('example.com/path/file.html', 'example.com/path/file.html!') >>> calc_search_range('http://example.com/path/file.html', 'prefix', False) ('example.com/path/file.html', 'example.com/path/file.htmm') >>> calc_search_range('http://example.com/path/file.html', 'host', False) ('example.com/', 'example.com0') # errors: domain range not supported >>> calc_search_range('http://example.com/path/file.html', 'domain', False) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): UrlCanonicalizeException: matchType=domain unsupported for non-surt >>> calc_search_range('http://example.com/path/file.html', 'blah', False) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): UrlCanonicalizeException: Invalid match_type: blah """ def inc_last_char(x): return x[0:-1] + chr(ord(x[-1]) + 1) if not url_canon: # make new canon url_canon = UrlCanonicalizer(surt_ordered) else: # ensure surt order matches url_canon surt_ordered = url_canon.surt_ordered start_key = url_canon(url) if match_type == 'exact': end_key = start_key + '!' elif match_type == 'prefix': # add trailing slash if url has it if url.endswith('/') and not start_key.endswith('/'): start_key += '/' if url.endswith('?') and not start_key.endswith('?'): start_key += '?' end_key = inc_last_char(start_key) elif match_type == 'host': if surt_ordered: host = start_key.split(')/')[0] start_key = host + ')/' end_key = host + '*' else: host = urlparse.urlsplit(url).netloc start_key = host + '/' end_key = host + '0' elif match_type == 'domain': if not surt_ordered: msg = 'matchType=domain unsupported for non-surt' raise UrlCanonicalizeException(msg) host = start_key.split(')/')[0] # if tld, use com, as start_key # otherwise, stick with com,example)/ if ',' not in host: start_key = host + ',' else: start_key = host + ')/' end_key = host + '-' else: raise UrlCanonicalizeException('Invalid match_type: ' + match_type) return (start_key, end_key)
if __name__ == "__main__": import doctest doctest.testmod()