import six
from warcio.recordloader import ArchiveLoadFailed
from warcio.timeutils import iso_date_to_timestamp
from pywb.utils.io import no_except_close
from pywb.utils.wbexception import NotFoundException
from pywb.warcserver.resource.blockrecordloader import BlockArcWarcRecordLoader
# =================================================================
[docs]class ResolvingLoader(object):
MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded'
EMPTY_DIGEST = '3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ'
def __init__(self, path_resolvers, record_loader=None, no_record_parse=False):
self.path_resolvers = path_resolvers
self.record_loader = record_loader if record_loader is not None else BlockArcWarcRecordLoader()
self.no_record_parse = no_record_parse
def __call__(self, cdx, failed_files, cdx_loader, *args, **kwargs):
headers_record, payload_record = self.load_headers_and_payload(cdx, failed_files, cdx_loader)
# Default handling logic when loading http status/headers
# special case: set header to payload if old-style revisit
# with missing header
if not headers_record:
headers_record = payload_record
elif headers_record != payload_record:
# close remainder of stream as this record only used for
# (already parsed) headers
no_except_close(headers_record.raw_stream)
# special case: check if headers record is actually empty
# (eg empty revisit), then use headers from revisit
if not headers_record.http_headers.headers:
headers_record = payload_record
if not headers_record or not payload_record:
if headers_record:
no_except_close(headers_record.raw_stream)
if payload_record:
no_except_close(payload_record.raw_stream)
raise ArchiveLoadFailed('Could not load ' + str(cdx))
# ensure status line is valid from here
headers_record.http_headers.validate_statusline('204 No Content')
return (headers_record.http_headers, payload_record.raw_stream)
[docs] def load_headers_and_payload(self, cdx, failed_files, cdx_loader):
"""
Resolve headers and payload for a given capture
In the simple case, headers and payload are in the same record.
In the case of revisit records, the payload and headers may be in
different records.
If the original has already been found, lookup original using
orig. fields in cdx dict.
Otherwise, call _load_different_url_payload() to get cdx index
from a different url to find the original record.
"""
has_curr = (cdx['filename'] != '-')
# has_orig = (cdx.get('orig.filename', '-') != '-')
orig_f = cdx.get('orig.filename')
has_orig = orig_f and orig_f != '-'
# load headers record from cdx['filename'] unless it is '-' (rare)
headers_record = None
payload_record = None
if has_curr:
headers_record = self._resolve_path_load(cdx, False, failed_files)
# two index lookups
# Case 1: if mimetype is still warc/revisit
if cdx.get('mime') == 'warc/revisit' and headers_record:
if headers_record.http_headers:
status = headers_record.http_headers.get_statuscode()
# optimization: if redirect, don't load payload record, as it'll be ignored by browser
# always replay zero-length payload
if status and status.startswith('3'):
headers_record.http_headers.replace_header('Content-Length', '0')
return headers_record, headers_record
payload_record = self._load_different_url_payload(cdx,
headers_record,
failed_files,
cdx_loader)
# single lookup cases
# case 2: non-revisit
elif (has_curr and not has_orig):
payload_record = headers_record
# case 3: identical url revisit, load payload from orig.filename
elif (has_orig):
payload_record = self._resolve_path_load(cdx, True, failed_files)
return headers_record, payload_record
def _resolve_path_load(self, cdx, is_original, failed_files):
"""
Load specific record based on filename, offset and length
fields in the cdx.
If original=True, use the orig.* fields for the cdx
Resolve the filename to full path using specified path resolvers
If failed_files list provided, keep track of failed resolve attempts
"""
if is_original:
(filename, offset, length) = (cdx['orig.filename'],
cdx['orig.offset'],
cdx['orig.length'])
else:
(filename, offset, length) = (cdx['filename'],
cdx['offset'],
cdx.get('length', '-'))
# optimization: if same file already failed this request,
# don't try again
if failed_files is not None and filename in failed_files:
raise ArchiveLoadFailed('Skipping Already Failed: ' + filename)
any_found = False
last_exc = None
last_traceback = None
for resolver in self.path_resolvers:
possible_paths = resolver(filename, cdx)
if not possible_paths:
continue
if isinstance(possible_paths, six.string_types):
possible_paths = [possible_paths]
for path in possible_paths:
any_found = True
try:
return (self.record_loader.
load(path, offset, length,
no_record_parse=self.no_record_parse))
except Exception as ue:
last_exc = ue
import sys
last_traceback = sys.exc_info()[2]
# Unsuccessful if reached here
if failed_files is not None:
failed_files.append(filename)
if last_exc:
# msg = str(last_exc.__class__.__name__)
msg = str(last_exc)
else:
msg = 'Archive File Not Found'
# raise ArchiveLoadFailed(msg, filename), None, last_traceback
six.reraise(ArchiveLoadFailed, ArchiveLoadFailed(filename + ': ' + msg), last_traceback)
def _load_different_url_payload(self, cdx, headers_record,
failed_files, cdx_loader):
"""
Handle the case where a duplicate of a capture with same digest
exists at a different url.
If a cdx_server is provided, a query is made for matching
url, timestamp and digest.
Raise exception if no matches found.
"""
digest = cdx.get('digest', '-')
# if the digest is the empty record digest, don't attempt to look up the payload record!
# the payload is simply empty, so use empty payload of existing record
if digest == self.EMPTY_DIGEST:
return headers_record
ref_target_uri = (headers_record.rec_headers.
get_header('WARC-Refers-To-Target-URI'))
target_uri = headers_record.rec_headers.get_header('WARC-Target-URI')
# if no target uri, no way to find the original
if not ref_target_uri:
raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)
ref_target_date = (headers_record.rec_headers.
get_header('WARC-Refers-To-Date'))
if not ref_target_date:
ref_target_date = cdx['timestamp']
else:
ref_target_date = iso_date_to_timestamp(ref_target_date)
try:
orig_cdx_lines = self.load_cdx_for_dupe(ref_target_uri,
ref_target_date,
digest,
cdx_loader)
except NotFoundException:
raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)
for orig_cdx in orig_cdx_lines:
try:
payload_record = self._resolve_path_load(orig_cdx, False,
failed_files)
return payload_record
except ArchiveLoadFailed as e:
pass
raise ArchiveLoadFailed(self.MISSING_REVISIT_MSG)
[docs] def load_cdx_for_dupe(self, url, timestamp, digest, cdx_loader):
"""
If a cdx_server is available, return response from server,
otherwise empty list
"""
if not cdx_loader:
return iter([])
filters = []
filters.append('!mime:warc/revisit')
if digest and digest != '-':
filters.append('digest:' + digest)
params = dict(url=url,
closest=timestamp,
filter=filters)
return cdx_loader(params)