Source code for pywb.indexer.cdxindexer

import os
import sys

# Use ujson if available
try:
    from ujson import dumps as ujson_dumps

    try:
        assert (ujson_dumps('http://example.com/',
                            escape_forward_slashes=False) ==
                '"http://example.com/"')
    except Exception as e:  # pragma: no cover
        sys.stderr.write('ujson w/o forward-slash escaping not available, '
                         'defaulting to regular json\n')
        raise

    def json_encode(obj):
        return ujson_dumps(obj, escape_forward_slashes=False)

except Exception:  # pragma: no cover
    from json import dumps as json_encode

try:  # pragma: no cover
    from collections import OrderedDict
except ImportError:  # pragma: no cover
    from ordereddict import OrderedDict


from argparse import ArgumentParser, RawTextHelpFormatter

from six import StringIO

from pywb.indexer.archiveindexer import DefaultRecordParser
import codecs
import six


#=================================================================
class BaseCDXWriter(object):
    # To ensure we do not index metadata mime types
    # from older WARC specs (Heritrix 1.x) that collide with response records
    METADATA_NO_INDEX_TYPES = ('text/anvl', )

    def __init__(self, out):
        self.out = codecs.getwriter('utf-8')(out)
        #self.out = out

    def __enter__(self):
        self._write_header()
        return self

    def write(self, entry, filename):
        if not entry.get('url') or not entry.get('urlkey'):
            return

        if self._is_skipped(entry):
            return

        self.write_cdx_line(self.out, entry, filename)

    def _is_skipped(self, entry):
        if entry.record.rec_type == 'warcinfo':
            return True

        return (entry.record.rec_type == 'metadata' and
                entry['mime'] in self.METADATA_NO_INDEX_TYPES)

    def __exit__(self, *args):
        return False
#=================================================================
class CDXJ(object):
    def _write_header(self):
        pass

    def write_cdx_line(self, out, entry, filename):
        out.write(entry['urlkey'])
        out.write(' ')
        out.write(entry['timestamp'])
        out.write(' ')

        outdict = OrderedDict()

        for n, v in six.iteritems(entry):
            if n in ('urlkey', 'timestamp'):
                continue

            if n.startswith('_'):
                continue

            if not v or v == '-':
                continue

            outdict[n] = v

        outdict['filename'] = filename
        out.write(json_encode(outdict))
        out.write('\n')
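
# Illustration only (not part of the original module): a CDXJ line emitted
# above consists of the urlkey and timestamp followed by a JSON dict of the
# remaining fields, e.g. (all values below are made up):
#
#   com,example)/ 20140102030405 {"url": "http://example.com/", "mime": "text/html", "status": "200", "digest": "...", "length": "1043", "offset": "334", "filename": "example.warc.gz"}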
#=================================================================
class CDX09(object):
    def _write_header(self):
        self.out.write(' CDX N b a m s k r V g\n')

    def write_cdx_line(self, out, entry, filename):
        out.write(entry['urlkey'])
        out.write(' ')
        out.write(entry['timestamp'])
        out.write(' ')
        try:
            out.write(entry['url'])
        except UnicodeDecodeError:
            out.write(entry['url'].decode('utf-8'))
        out.write(' ')
        out.write(entry['mime'])
        out.write(' ')
        out.write(entry['status'])
        out.write(' ')
        out.write(entry['digest'])
        out.write(' - ')
        out.write(entry['offset'])
        out.write(' ')
        out.write(filename)
        out.write('\n')
#=================================================================
class CDX11(object):
    def _write_header(self):
        self.out.write(' CDX N b a m s k r M S V g\n')

    def write_cdx_line(self, out, entry, filename):
        out.write(entry['urlkey'])
        out.write(' ')
        out.write(entry['timestamp'])
        out.write(' ')
        try:
            out.write(entry['url'])
        except UnicodeDecodeError:
            out.write(entry['url'].decode('utf-8'))
        out.write(' ')
        out.write(entry['mime'])
        out.write(' ')
        out.write(entry['status'])
        out.write(' ')
        out.write(entry['digest'])
        out.write(' - - ')
        out.write(entry['length'])
        out.write(' ')
        out.write(entry['offset'])
        out.write(' ')
        out.write(filename)
        out.write('\n')
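
# Illustration only (not part of the original module): the 11-field line
# written above follows the ' CDX N b a m s k r M S V g' header, e.g.
# (all values below are made up):
#
#   com,example)/ 20140102030405 http://example.com/ text/html 200 A1B2C3D4E5F6G7H8I9J0K1L2M3N4O5P6 - - 1043 334 example.warc.gz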
#=================================================================
class SortedCDXWriter(BaseCDXWriter):
    def __enter__(self):
        self.sortlist = []
        res = super(SortedCDXWriter, self).__enter__()
        self.actual_out = self.out
        return res

    def write(self, entry, filename):
        self.out = StringIO()
        super(SortedCDXWriter, self).write(entry, filename)
        line = self.out.getvalue()
        if line:
            self.sortlist.append(line)

    def __exit__(self, *args):
        self.sortlist.sort()
        self.actual_out.write(''.join(self.sortlist))
        return False
#=================================================================
ALLOWED_EXT = ('.arc', '.arc.gz', '.warc', '.warc.gz')


#=================================================================
def _resolve_rel_path(path, rel_root):
    path = os.path.relpath(path, rel_root)
    if os.path.sep != '/':  #pragma: no cover
        path = path.replace(os.path.sep, '/')
    return path


#=================================================================
def iter_file_or_dir(inputs, recursive=True, rel_root=None):
    for input_ in inputs:
        if not os.path.isdir(input_):
            if not rel_root:
                filename = os.path.basename(input_)
            else:
                filename = _resolve_rel_path(input_, rel_root)

            yield input_, filename

        elif not recursive:
            for filename in os.listdir(input_):
                if filename.endswith(ALLOWED_EXT):
                    full_path = os.path.join(input_, filename)
                    if rel_root:
                        filename = _resolve_rel_path(full_path, rel_root)

                    yield full_path, filename

        else:
            for root, dirs, files in os.walk(input_):
                for filename in files:
                    if filename.endswith(ALLOWED_EXT):
                        full_path = os.path.join(root, filename)
                        if not rel_root:
                            rel_root = input_
                        rel_path = _resolve_rel_path(full_path, rel_root)
                        yield full_path, rel_path
#=================================================================
def remove_ext(filename):
    for ext in ALLOWED_EXT:
        if filename.endswith(ext):
            filename = filename[:-len(ext)]
            break

    return filename
#=================================================================
def cdx_filename(filename):
    return remove_ext(filename) + '.cdx'
#=================================================================
def get_cdx_writer_cls(options):
    if options.get('minimal'):
        options['cdxj'] = True

    writer_cls = options.get('writer_cls')

    if writer_cls:
        if not options.get('writer_add_mixin'):
            return writer_cls
    elif options.get('sort'):
        writer_cls = SortedCDXWriter
    else:
        writer_cls = BaseCDXWriter

    if options.get('cdxj'):
        format_mixin = CDXJ
    elif options.get('cdx09'):
        format_mixin = CDX09
    else:
        format_mixin = CDX11

    class CDXWriter(writer_cls, format_mixin):
        pass

    return CDXWriter
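
# Illustration only (a sketch, not part of the original module): with
# sort=True and cdxj=True the class built above combines SortedCDXWriter
# with the CDXJ mixin, so ordering comes from the writer base class and the
# line format from the mixin:
#
#   SortedCDXJWriter = get_cdx_writer_cls({'sort': True, 'cdxj': True})
#   assert issubclass(SortedCDXJWriter, SortedCDXWriter)
#   assert issubclass(SortedCDXJWriter, CDXJ)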
#=================================================================
def write_multi_cdx_index(output, inputs, **options):
    recurse = options.get('recurse', False)
    rel_root = options.get('rel_root')

    # if output is a directory, write one cdx file per input file
    if output != '-' and os.path.isdir(output):
        for fullpath, filename in iter_file_or_dir(inputs, recurse, rel_root):
            outpath = cdx_filename(filename)
            outpath = os.path.join(output, outpath)

            with open(outpath, 'wb') as outfile:
                with open(fullpath, 'rb') as infile:
                    writer = write_cdx_index(outfile, infile, filename,
                                             **options)

        return writer

    # write to one cdx file
    else:
        if output == '-':
            if hasattr(sys.stdout, 'buffer'):
                outfile = sys.stdout.buffer
            else:
                outfile = sys.stdout
        else:
            outfile = open(output, 'wb')

        writer_cls = get_cdx_writer_cls(options)
        record_iter = DefaultRecordParser(**options)

        with writer_cls(outfile) as writer:
            for fullpath, filename in iter_file_or_dir(inputs, recurse,
                                                       rel_root):
                with open(fullpath, 'rb') as infile:
                    entry_iter = record_iter(infile)

                    for entry in entry_iter:
                        writer.write(entry, filename)

        return writer
#=================================================================
def write_cdx_index(outfile, infile, filename, **options):
    #filename = filename.encode(sys.getfilesystemencoding())

    writer_cls = get_cdx_writer_cls(options)

    with writer_cls(outfile) as writer:
        entry_iter = DefaultRecordParser(**options)(infile)

        for entry in entry_iter:
            writer.write(entry, filename)

    return writer
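
# Example usage (a minimal sketch, not part of the original module; the
# helper name and file paths below are placeholders):
def _example_write_single_cdxj():  # pragma: no cover
    with open('./cdx/example.cdx', 'wb') as outfile:
        with open('./warcs/example.warc.gz', 'rb') as infile:
            # sort/cdxj mirror the -s / -j command line options in main()
            write_cdx_index(outfile, infile, 'example.warc.gz',
                            sort=True, cdxj=True)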
#=================================================================
def main(args=None):
    description = """
Generate .cdx index files for WARCs and ARCs
Compressed (.warc.gz / .arc.gz) or uncompressed (.warc / .arc) formats
are supported.
"""

    epilog = """
Some examples:

* Create "example.cdx" index from example.warc.gz
{0} --output ./cdx/example.cdx ./warcs/example.warc.gz

* Create "combined.cdx", a combined, sorted index of all warcs in ./warcs/
{0} --sort --output combined.cdx ./warcs/

* Create a sorted cdx per file in ./cdx/ for each archive file in ./warcs/
{0} --sort --output ./cdx/ ./warcs/
""".format(os.path.basename(sys.argv[0]))

    sort_help = """
Sort the output to each file before writing to create a total ordering
"""

    unsurt_help = """
Convert SURT (Sort-friendly URI Reordering Transform) back to regular
urls for the cdx key. Default is to use SURT keys.
Not recommended for new cdx; use only for backwards-compatibility.
"""

    verify_help = """
Verify HTTP protocol (1.0/1.1) status in response records
and http verb on request records, ensuring the protocol or verb
matches the expected list. Raise an exception on failure.
(This was previously the default behavior.)
"""

    cdx09_help = """
Use older 9-field cdx format; default is the 11-field cdx format
"""

    minimal_json_help = """
CDX JSON output, but with minimal fields only, available without
parsing the http record. The fields are:
canonicalized url, timestamp, original url, digest, archive offset,
archive length and archive filename. mimetype is included only to
indicate warc/revisit records.

This option skips record parsing and will not work with the
POST append (-p) option
"""

    json_help = """
Output CDX JSON format per line, with url timestamp first,
followed by a json dict for all other fields:
url timestamp { ... }
"""

    output_help = """
Output file or directory.
- If directory, each input file is written to a separate output file
  with a .cdx extension
- If output is '-', output is written to stdout
"""

    input_help = """
Input file or directory.
- If directory, all archive files from that directory are read
"""

    allrecords_help = """
Include all records.
Currently includes the 'request' records in addition to all
response records
"""

    post_append_help = """
For POST requests, append form query to url key.
(Only applies to form url encoded posts)
"""

    recurse_dirs_help = """
Recurse through all subdirectories if the input is a directory
"""

    dir_root_help = """
Make CDX filenames relative to specified root directory,
instead of current working directory
"""

    parser = ArgumentParser(description=description,
                            epilog=epilog,
                            formatter_class=RawTextHelpFormatter)

    parser.add_argument('-s', '--sort',
                        action='store_true',
                        help=sort_help)

    parser.add_argument('-a', '--allrecords',
                        action='store_true',
                        help=allrecords_help)

    parser.add_argument('-p', '--postappend',
                        action='store_true',
                        help=post_append_help)

    parser.add_argument('-r', '--recurse',
                        action='store_true',
                        help=recurse_dirs_help)

    parser.add_argument('-d', '--dir-root',
                        help=dir_root_help)

    parser.add_argument('-u', '--unsurt',
                        action='store_true',
                        help=unsurt_help)

    parser.add_argument('-v', '--verify',
                        action='store_true',
                        help=verify_help)

    group = parser.add_mutually_exclusive_group()
    group.add_argument('-9', '--cdx09',
                       action='store_true',
                       help=cdx09_help)

    group.add_argument('-j', '--cdxj',
                       action='store_true',
                       help=json_help)

    parser.add_argument('-mj', '--minimal-cdxj',
                        action='store_true',
                        help=minimal_json_help)

    parser.add_argument('-o', '--output',
                        default='-',
                        help=output_help)

    parser.add_argument('inputs', nargs='+', help=input_help)

    cmd = parser.parse_args(args=args)

    write_multi_cdx_index(cmd.output, cmd.inputs,
                          sort=cmd.sort,
                          surt_ordered=not cmd.unsurt,
                          include_all=cmd.allrecords,
                          append_post=cmd.postappend,
                          recurse=cmd.recurse,
                          rel_root=cmd.dir_root,
                          verify_http=cmd.verify,
                          cdx09=cmd.cdx09,
                          cdxj=cmd.cdxj,
                          minimal=cmd.minimal_cdxj)
if __name__ == '__main__':
    main()