Source code for pywb.warcserver.resource.pathresolvers

import redis

from warcio.utils import to_native_str
from pywb.utils.binsearch import iter_exact

from pywb.warcserver.index.indexsource import RedisIndexSource

from pywb.utils.loaders import from_file_url
import six

import os
import logging
import glob

"""
The purpose of this module is to 'resolve' a warc/arc filename,
often found in a CDX file, to a full loadable url.

Supported resolvers are: URL prefix, path index lookup, and Redis.

make_best_resolver() attempts to guess the resolver method for a given URI.

"""


#=============================================================================
# PrefixResolver - convert cdx file entry to url with prefix
# if url contains specified string
#=============================================================================
class PrefixResolver(object):
    """Resolve a filename by appending it to a (possibly templated) prefix.

    The prefix may contain a ``*`` wildcard, which is substituted with the
    ``source-coll`` value of the cdx entry when present, or otherwise
    expanded against the local filesystem with ``glob``.
    """

    def __init__(self, template):
        """Store the prefix template that filenames are appended to."""
        self.template = template

    def __call__(self, filename, cdx):
        """Return the resolved location(s) for ``filename``.

        :param filename: name appended verbatim to the prefix.
        :param cdx: mapping-like cdx entry; must support ``.get()``. If it
            carries a truthy ``_formatter`` attribute, that object's
            ``format()`` is applied to the template first.
        :return: a single string, or a sorted list of strings when the
            ``*`` wildcard matched one or more local paths.
        """
        prefix = self.template
        formatter = getattr(cdx, '_formatter', None)
        if formatter:
            prefix = formatter.format(prefix)

        # The filename is appended verbatim; no separator is inserted.
        path = prefix + filename
        if '*' not in path:
            return path

        coll = cdx.get('source-coll')
        if coll:
            return path.replace('*', coll)

        # Anything containing a scheme separator is returned untouched
        # rather than being globbed against the local filesystem.
        if '://' in path:
            return path

        # glob.glob() yields matches in arbitrary order; sort them so the
        # candidate list is deterministic across runs and filesystems.
        matches = sorted(glob.glob(path))
        return matches if matches else path

    def resolve_coll(self, path, source):
        """Replace ``*`` in ``path`` with the first component of ``source``.

        ``source`` is split on ``/`` (after normalising the platform path
        separator). Returns ``None`` when ``source`` is empty or ``None``.
        """
        if not source:
            return None

        if os.path.sep != '/':
            source = source.replace(os.path.sep, '/')

        coll = source.split('/', 1)[0]
        return path.replace('*', coll)

    def __repr__(self):
        return "PrefixResolver('{0}')".format(self.template)
#=============================================================================
class RedisResolver(RedisIndexSource):
    """Resolve a filename by looking it up as a field in a redis hash.

    Relies on ``redis_key_template``, ``redis``, ``scan_keys`` and
    ``redis_url`` being supplied by ``RedisIndexSource``.
    """

    def __call__(self, filename, cdx):
        """Return the value stored for ``filename``, or a falsy value if absent.

        If ``cdx`` carries a truthy ``_formatter``, the key template is
        formatted with it and its ``params`` are passed to ``scan_keys``.
        A key containing ``*`` is expanded via ``scan_keys`` and the first
        truthy ``hget`` result wins; otherwise a single ``hget`` is done.
        """
        key_template = self.redis_key_template
        scan_params = {}

        formatter = getattr(cdx, '_formatter', None)
        if formatter:
            key_template = formatter.format(key_template)
            scan_params = formatter.params

        if '*' not in key_template:
            value = self.redis.hget(key_template, filename)
        else:
            value = None
            for candidate_key in self.scan_keys(key_template, scan_params):
                value = self.redis.hget(candidate_key, filename)
                if value:
                    break

        return to_native_str(value, 'utf-8')

    def __repr__(self):
        return "RedisResolver('{0}')".format(self.redis_url)
#=================================================================
class PathIndexResolver(object):
    """Resolve a filename through a tab-delimited path index file.

    Each matching index line is split on tabs; every field after the
    first is yielded as a candidate path.
    """

    def __init__(self, pathindex_file):
        """Remember the location of the path index file."""
        self.pathindex_file = pathindex_file

    def __call__(self, filename, cdx):
        """Lazily yield every path listed for ``filename`` in the index.

        This is a generator: the index file is opened on first iteration
        and stays open until the generator is exhausted or closed.
        ``cdx`` is accepted for interface parity and is not used.
        """
        with open(self.pathindex_file, 'rb') as index_fh:
            lookup_key = filename.encode('utf-8')
            for entry in iter_exact(index_fh, lookup_key, b'\t'):
                # First field is the filename itself; the rest are paths.
                fields = entry.split(b'\t')
                for raw_path in fields[1:]:
                    yield to_native_str(raw_path, 'utf-8')

    def __repr__(self):  # pragma: no cover
        return "PathIndexResolver('{0}')".format(self.pathindex_file)
#=================================================================
class DefaultResolverMixin(object):
    """Mixin providing factory helpers that pick a resolver for a path."""

    @classmethod
    def make_best_resolver(cls, path):
        """Choose a resolver for ``path``.

        * an object with ``__call__`` is returned unchanged;
        * a string starting with ``redis://`` yields a ``RedisResolver``;
        * otherwise the path is passed through ``from_file_url`` and an
          existing regular file yields a ``PathIndexResolver``, anything
          else a ``PrefixResolver``.
        """
        if hasattr(path, '__call__'):
            return path

        if path.startswith('redis://'):
            return RedisResolver(path)

        local_path = from_file_url(path)
        resolver_cls = (PathIndexResolver if os.path.isfile(local_path)
                        else PrefixResolver)
        return resolver_cls(local_path)

    @classmethod
    def make_resolvers(cls, paths):
        """Build a list of resolvers from one path, many paths, or ``None``.

        A single string is treated as a one-element list; ``None`` gives
        an empty list.
        """
        if paths is None:
            return []

        if isinstance(paths, six.string_types):
            paths = [paths]

        return list(map(cls.make_best_resolver, paths))