Source code for pywb.indexer.cdxindexer

import os
import sys

# Use ujson if available
try:
    from ujson import dumps as ujson_dumps

    try:
        assert (ujson_dumps('http://example.com/',
                            escape_forward_slashes=False) ==
                '"http://example.com/"')
    except Exception as e:  # pragma: no cover
        sys.stderr.write('ujson w/o forward-slash escaping not available, '
                         'defaulting to regular json\n')
        raise

    def json_encode(obj):
        return ujson_dumps(obj, escape_forward_slashes=False)

except Exception:  # pragma: no cover
    from json import dumps as json_encode

try:  # pragma: no cover
    from collections import OrderedDict
except ImportError:  # pragma: no cover
    from ordereddict import OrderedDict


from argparse import ArgumentParser, RawTextHelpFormatter

from six import StringIO

from pywb.indexer.archiveindexer import DefaultRecordParser
import codecs
import six


#=================================================================
class BaseCDXWriter(object):
    # To ensure we do not index metadata mime types
    # from older WARC specs (Heritrix 1.x) that collide with response records
    METADATA_NO_INDEX_TYPES = ('text/anvl', )

    def __init__(self, out):
        self.out = codecs.getwriter('utf-8')(out)
        #self.out = out

    def __enter__(self):
        self._write_header()
        return self

    def write(self, entry, filename):
        if not entry.get('url') or not entry.get('urlkey'):
            return

        if self._is_skipped(entry):
            return

        self.write_cdx_line(self.out, entry, filename)

    def _is_skipped(self, entry):
        if entry.record.rec_type == 'warcinfo':
            return True

        return (entry.record.rec_type == 'metadata' and
                entry['mime'] in self.METADATA_NO_INDEX_TYPES)

    def __exit__(self, *args):
        return False
#=================================================================
class CDXJ(object):
    def _write_header(self):
        pass

    def write_cdx_line(self, out, entry, filename):
        out.write(entry['urlkey'])
        out.write(' ')
        out.write(entry['timestamp'])
        out.write(' ')

        outdict = OrderedDict()

        for n, v in six.iteritems(entry):
            if n in ('urlkey', 'timestamp'):
                continue

            if n.startswith('_'):
                continue

            if not v or v == '-':
                continue

            outdict[n] = v

        outdict['filename'] = filename
        out.write(json_encode(outdict))
        out.write('\n')
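
# Illustration only (not part of the original module): a CDXJ line emitted
# above consists of the urlkey and timestamp followed by a JSON dict of the
# remaining fields, e.g. (all values below are made up):
#
#   com,example)/ 20140102030405 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "...", "length": "1043", "offset": "334", "filename": "example.warc.gz"}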
#=================================================================
class CDX09(object):
    def _write_header(self):
        self.out.write(' CDX N b a m s k r V g\n')

    def write_cdx_line(self, out, entry, filename):
        out.write(entry['urlkey'])
        out.write(' ')
        out.write(entry['timestamp'])
        out.write(' ')
        try:
            out.write(entry['url'])
        except UnicodeDecodeError:
            out.write(entry['url'].decode('utf-8'))
        out.write(' ')
        out.write(entry['mime'])
        out.write(' ')
        out.write(entry['status'])
        out.write(' ')
        out.write(entry['digest'])
        out.write(' - ')
        out.write(entry['offset'])
        out.write(' ')
        out.write(filename)
        out.write('\n')
#=================================================================
class CDX11(object):
    def _write_header(self):
        self.out.write(' CDX N b a m s k r M S V g\n')

    def write_cdx_line(self, out, entry, filename):
        out.write(entry['urlkey'])
        out.write(' ')
        out.write(entry['timestamp'])
        out.write(' ')
        try:
            out.write(entry['url'])
        except UnicodeDecodeError:
            out.write(entry['url'].decode('utf-8'))
        out.write(' ')
        out.write(entry['mime'])
        out.write(' ')
        out.write(entry['status'])
        out.write(' ')
        out.write(entry['digest'])
        out.write(' - - ')
        out.write(entry['length'])
        out.write(' ')
        out.write(entry['offset'])
        out.write(' ')
        out.write(filename)
        out.write('\n')
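
# Illustration only (not part of the original module): the 11-field line
# written above follows the ' CDX N b a m s k r M S V g' header, e.g.
# (all values below are made up):
#
#   com,example)/ 20140102030405 http://example.com/ text/html 200 A1B2C3D4E5F6G7H8I9J0K1L2M3N4O5P6 - - 1043 334 example.warc.gz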
#=================================================================
class SortedCDXWriter(BaseCDXWriter):
    def __enter__(self):
        self.sortlist = []
        res = super(SortedCDXWriter, self).__enter__()
        self.actual_out = self.out
        return res

    def write(self, entry, filename):
        self.out = StringIO()
        super(SortedCDXWriter, self).write(entry, filename)
        line = self.out.getvalue()
        if line:
            self.sortlist.append(line)

    def __exit__(self, *args):
        self.sortlist.sort()
        self.actual_out.write(''.join(self.sortlist))
        return False
#=================================================================
ALLOWED_EXT = ('.arc', '.arc.gz', '.warc', '.warc.gz')


#=================================================================
def _resolve_rel_path(path, rel_root):
    path = os.path.relpath(path, rel_root)
    if os.path.sep != '/':  #pragma: no cover
        path = path.replace(os.path.sep, '/')
    return path


#=================================================================
def iter_file_or_dir(inputs, recursive=True, rel_root=None):
    for input_ in inputs:
        if not os.path.isdir(input_):
            if not rel_root:
                filename = os.path.basename(input_)
            else:
                filename = _resolve_rel_path(input_, rel_root)

            yield input_, filename

        elif not recursive:
            for filename in os.listdir(input_):
                if filename.endswith(ALLOWED_EXT):
                    full_path = os.path.join(input_, filename)
                    if rel_root:
                        filename = _resolve_rel_path(full_path, rel_root)

                    yield full_path, filename

        else:
            for root, dirs, files in os.walk(input_):
                for filename in files:
                    if filename.endswith(ALLOWED_EXT):
                        full_path = os.path.join(root, filename)
                        if not rel_root:
                            rel_root = input_
                        rel_path = _resolve_rel_path(full_path, rel_root)
                        yield full_path, rel_path
#=================================================================
def remove_ext(filename):
    for ext in ALLOWED_EXT:
        if filename.endswith(ext):
            filename = filename[:-len(ext)]
            break

    return filename
#=================================================================
def cdx_filename(filename):
    return remove_ext(filename) + '.cdx'
#=================================================================
def get_cdx_writer_cls(options):
    if options.get('minimal'):
        options['cdxj'] = True

    writer_cls = options.get('writer_cls')

    if writer_cls:
        if not options.get('writer_add_mixin'):
            return writer_cls
    elif options.get('sort'):
        writer_cls = SortedCDXWriter
    else:
        writer_cls = BaseCDXWriter

    if options.get('cdxj'):
        format_mixin = CDXJ
    elif options.get('cdx09'):
        format_mixin = CDX09
    else:
        format_mixin = CDX11

    class CDXWriter(writer_cls, format_mixin):
        pass

    return CDXWriter
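
# Illustration only (a sketch, not part of the original module): with
# sort=True and cdxj=True the class built above combines SortedCDXWriter
# with the CDXJ mixin, so ordering comes from the writer base class and the
# line format from the mixin:
#
#   SortedCDXJWriter = get_cdx_writer_cls({'sort': True, 'cdxj': True})
#   assert issubclass(SortedCDXJWriter, SortedCDXWriter)
#   assert issubclass(SortedCDXJWriter, CDXJ)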
#=================================================================
def write_multi_cdx_index(output, inputs, **options):
    recurse = options.get('recurse', False)
    rel_root = options.get('rel_root')

    # if output is a directory, write one cdx file per input file
    if output != '-' and os.path.isdir(output):
        for fullpath, filename in iter_file_or_dir(inputs, recurse, rel_root):
            outpath = cdx_filename(filename)
            outpath = os.path.join(output, outpath)

            with open(outpath, 'wb') as outfile:
                with open(fullpath, 'rb') as infile:
                    writer = write_cdx_index(outfile, infile, filename,
                                             **options)

        return writer

    # write to one cdx file
    else:
        if output == '-':
            if hasattr(sys.stdout, 'buffer'):
                outfile = sys.stdout.buffer
            else:
                outfile = sys.stdout
        else:
            outfile = open(output, 'wb')

        writer_cls = get_cdx_writer_cls(options)
        record_iter = DefaultRecordParser(**options)

        with writer_cls(outfile) as writer:
            for fullpath, filename in iter_file_or_dir(inputs, recurse,
                                                       rel_root):
                with open(fullpath, 'rb') as infile:
                    entry_iter = record_iter(infile)

                    for entry in entry_iter:
                        writer.write(entry, filename)

        return writer
#=================================================================
def write_cdx_index(outfile, infile, filename, **options):
    #filename = filename.encode(sys.getfilesystemencoding())

    writer_cls = get_cdx_writer_cls(options)

    with writer_cls(outfile) as writer:
        entry_iter = DefaultRecordParser(**options)(infile)

        for entry in entry_iter:
            writer.write(entry, filename)

    return writer
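
# Example usage (a minimal sketch, not part of the original module; the
# helper name and file paths below are placeholders):
def _example_write_single_cdxj():  # pragma: no cover
    with open('./cdx/example.cdx', 'wb') as outfile:
        with open('./warcs/example.warc.gz', 'rb') as infile:
            # sort/cdxj mirror the -s / -j command line options in main()
            write_cdx_index(outfile, infile, 'example.warc.gz',
                            sort=True, cdxj=True)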
#=================================================================
def main(args=None):
    description = """
Generate .cdx index files for WARCs and ARCs
Compressed (.warc.gz / .arc.gz) or uncompressed (.warc / .arc) formats
are supported.
"""

    epilog = """
Some examples:

* Create "example.cdx" index from example.warc.gz
{0} --output ./cdx/example.cdx ./warcs/example.warc.gz

* Create "combined.cdx", a combined, sorted index of all warcs in ./warcs/
{0} --sort --output combined.cdx ./warcs/

* Create a sorted cdx per file in ./cdx/ for each archive file in ./warcs/
{0} --sort --output ./cdx/ ./warcs/
""".format(os.path.basename(sys.argv[0]))

    sort_help = """
Sort the output to each file before writing to create a total ordering
"""

    unsurt_help = """
Convert SURT (Sort-friendly URI Reordering Transform) back to regular
urls for the cdx key. Default is to use SURT keys.
Not recommended for new cdx; use only for backwards-compatibility.
"""

    verify_help = """
Verify HTTP protocol (1.0/1.1) status in response records
and http verb on request records, ensuring the protocol or verb
matches the expected list. Raise an exception on failure.
(This was previously the default behavior.)
"""

    cdx09_help = """
Use older 9-field cdx format; default is the 11-field cdx format
"""

    minimal_json_help = """
CDX JSON output, but with minimal fields only, available without
parsing the http record. The fields are:
canonicalized url, timestamp, original url, digest, archive offset,
archive length and archive filename. mimetype is included only to
indicate warc/revisit records.

This option skips record parsing and will not work with the
POST append (-p) option
"""

    json_help = """
Output CDX JSON format per line, with url timestamp first,
followed by a json dict for all other fields:
url timestamp { ... }
"""

    output_help = """
Output file or directory.
- If directory, each input file is written to a separate output file
  with a .cdx extension
- If output is '-', output is written to stdout
"""

    input_help = """
Input file or directory.
- If directory, all archive files from that directory are read
"""

    allrecords_help = """
Include all records.
Currently includes the 'request' records in addition to all
response records
"""

    post_append_help = """
For POST requests, append form query to url key.
(Only applies to form url encoded posts)
"""

    recurse_dirs_help = """
Recurse through all subdirectories if the input is a directory
"""

    dir_root_help = """
Make CDX filenames relative to specified root directory,
instead of current working directory
"""

    parser = ArgumentParser(description=description,
                            epilog=epilog,
                            formatter_class=RawTextHelpFormatter)

    parser.add_argument('-s', '--sort',
                        action='store_true',
                        help=sort_help)

    parser.add_argument('-a', '--allrecords',
                        action='store_true',
                        help=allrecords_help)

    parser.add_argument('-p', '--postappend',
                        action='store_true',
                        help=post_append_help)

    parser.add_argument('-r', '--recurse',
                        action='store_true',
                        help=recurse_dirs_help)

    parser.add_argument('-d', '--dir-root',
                        help=dir_root_help)

    parser.add_argument('-u', '--unsurt',
                        action='store_true',
                        help=unsurt_help)

    parser.add_argument('-v', '--verify',
                        action='store_true',
                        help=verify_help)

    group = parser.add_mutually_exclusive_group()
    group.add_argument('-9', '--cdx09',
                       action='store_true',
                       help=cdx09_help)

    group.add_argument('-j', '--cdxj',
                       action='store_true',
                       help=json_help)

    parser.add_argument('-mj', '--minimal-cdxj',
                        action='store_true',
                        help=minimal_json_help)

    parser.add_argument('-o', '--output',
                        default='-',
                        help=output_help)

    parser.add_argument('inputs', nargs='+', help=input_help)

    cmd = parser.parse_args(args=args)

    write_multi_cdx_index(cmd.output, cmd.inputs,
                          sort=cmd.sort,
                          surt_ordered=not cmd.unsurt,
                          include_all=cmd.allrecords,
                          append_post=cmd.postappend,
                          recurse=cmd.recurse,
                          rel_root=cmd.dir_root,
                          verify_http=cmd.verify,
                          cdx09=cmd.cdx09,
                          cdxj=cmd.cdxj,
                          minimal=cmd.minimal_cdxj)
if __name__ == '__main__':
    main()