Source code for pywb.warcserver.handlers

from pywb.utils.wbexception import BadRequestException, WbException, AccessException
from pywb.utils.wbexception import NotFoundException
from pywb.utils.memento import MementoUtils

from warcio.recordloader import ArchiveLoadFailed

from pywb.warcserver.index.cdxobject import CDXException
from pywb.warcserver.index.fuzzymatcher import FuzzyMatcher
from pywb.warcserver.resource.responseloader import  WARCPathLoader, LiveWebLoader, VideoLoader

import six
import logging
import traceback


logger = logging.getLogger('warcserver')


#=============================================================================
[docs]def to_cdxj(cdx_iter, fields, params): content_type = 'text/x-cdxj' return content_type, (cdx.to_cdxj(fields) for cdx in cdx_iter)
[docs]def to_json(cdx_iter, fields, params): content_type = 'text/x-ndjson' return content_type, (cdx.to_json(fields) for cdx in cdx_iter)
[docs]def to_text(cdx_iter, fields, params): content_type = 'text/plain' return content_type, (cdx.to_text(fields) for cdx in cdx_iter)
#=============================================================================
[docs]class IndexHandler(object): OUTPUTS = { 'cdxj': to_cdxj, 'json': to_json, 'text': to_text, 'link': to_link, } DEF_OUTPUT = 'cdxj' def __init__(self, index_source, opts=None, *args, **kwargs): self.index_source = index_source self.opts = opts or {} self.fuzzy = FuzzyMatcher(kwargs.get('rules_file')) self.access_checker = kwargs.get('access_checker')
[docs] def get_supported_modes(self): return dict(modes=['list_sources', 'index'])
def _load_index_source(self, params): url = params.get('url') if not url: errs = dict(last_exc=BadRequestException('The "url" param is required')) return None, errs input_req = params.get('_input_req') if input_req: params['alt_url'] = input_req.include_method_query(url) cdx_iter = self.fuzzy(self.index_source, params) acl_user = params['_input_req'].env.get("HTTP_X_PYWB_ACL_USER") if self.access_checker: cdx_iter = self.access_checker(cdx_iter, acl_user) return cdx_iter def __call__(self, params): mode = params.get('mode', 'index') if mode == 'list_sources': return {}, self.index_source.get_source_list(params), {} if mode != 'index': return {}, self.get_supported_modes(), {} output = params.get('output', self.DEF_OUTPUT) fields = params.get('fields') if not fields: fields = params.get('fl') if fields and isinstance(fields, str): fields = fields.split(',') handler = self.OUTPUTS.get(output) if not handler: errs = dict(last_exc=BadRequestException('output={0} not supported'.format(output))) return None, None, errs cdx_iter = None try: cdx_iter, errs = self._load_index_source(params) except BadRequestException as e: errs = dict(last_exc=e) if not cdx_iter: return None, None, errs content_type, res = handler(cdx_iter, fields, params) out_headers = {'Content-Type': content_type} first_line = None try: # raise exceptions early so that they can be handled properly first_line = next(res) except StopIteration: pass except CDXException as e: errs = dict(last_exc=e) return None, None, errs def check_str(first_line, lines): if first_line is not None: if isinstance(first_line, six.text_type): first_line = first_line.encode('utf-8') yield first_line for line in lines: if isinstance(line, six.text_type): line = line.encode('utf-8') yield line return out_headers, check_str(first_line, res), errs
#=============================================================================
[docs]class ResourceHandler(IndexHandler): def __init__(self, index_source, resource_loaders, **kwargs): super(ResourceHandler, self).__init__(index_source, **kwargs) self.resource_loaders = resource_loaders
[docs] def get_supported_modes(self): res = super(ResourceHandler, self).get_supported_modes() res['modes'].append('resource') return res
def __call__(self, params): if params.get('mode', 'resource') != 'resource': return super(ResourceHandler, self).__call__(params) cdx_iter, errs = self._load_index_source(params) if not cdx_iter: return None, None, errs last_exc = None for cdx in cdx_iter: if cdx.get('access', 'allow') != 'allow': raise AccessException(msg={'access': cdx['access'], 'access_status': cdx.get('access_status', 451)}, url=cdx['url']) for loader in self.resource_loaders: try: out_headers, resp = loader(cdx, params) if resp is not None: return out_headers, resp, errs except (WbException, ArchiveLoadFailed) as e: last_exc = e if logger.isEnabledFor(logging.DEBUG): traceback.print_exc() errs[str(loader)] = str(e) if last_exc: errs['last_exc'] = last_exc return None, None, errs
#=============================================================================
[docs]class DefaultResourceHandler(ResourceHandler): def __init__(self, index_source, warc_paths='', forward_proxy_prefix='', **kwargs): loaders = [WARCPathLoader(warc_paths, index_source), LiveWebLoader(forward_proxy_prefix), VideoLoader() ] super(DefaultResourceHandler, self).__init__(index_source, loaders, **kwargs)
#=============================================================================
[docs]class HandlerSeq(object): def __init__(self, handlers): self.handlers = handlers
[docs] def get_supported_modes(self): if self.handlers: return self.handlers[0].get_supported_modes() else: return {}
def __call__(self, params): all_errs = {} for handler in self.handlers: out_headers, res, errs = handler(params) all_errs.update(errs) if res is not None: return out_headers, res, all_errs return None, None, all_errs