Source code for pywb.manager.migrate

from pywb.utils.canonicalize import canonicalize
from pywb.warcserver.index.cdxobject import CDXObject, URLKEY, ORIGINAL
from pywb.indexer.cdxindexer import CDXJ

import os
import shutil


#=============================================================================
class MigrateCDX(object):
    """Convert the ``.cdx`` index files found under a directory to ``.cdxj``.

    :param dir_: root directory that is walked recursively for ``.cdx`` files.
    """

    def __init__(self, dir_):
        self.cdx_dir = dir_

    def iter_cdx_files(self):
        """Yield the full path of every file ending in ``.cdx`` found while
        walking ``self.cdx_dir`` (sub-directories included).
        """
        for root, _dirs, files in os.walk(self.cdx_dir):
            for filename in files:
                if filename.endswith('.cdx'):
                    yield os.path.join(root, filename)

    def count_cdx(self):
        """Return the number of ``.cdx`` files below ``self.cdx_dir``."""
        return sum(1 for _ in self.iter_cdx_files())

    def convert_to_cdxj(self):
        """Rewrite each ``.cdx`` file as ``<name>.cdxj`` and delete the original.

        Every input line is parsed as a ``CDXObject``, its url key is
        recomputed with ``canonicalize`` from the original url, and the record
        is emitted through ``CDXJ.write_cdx_line``.  Output goes to
        ``<name>.cdxj.tmp`` first and is moved into place only once the whole
        file has been processed; the source ``.cdx`` is removed after that.

        If processing a file raises, the partial ``.tmp`` file is removed, the
        source ``.cdx`` is left untouched, and the exception propagates.
        """
        cdxj_writer = CDXJ()

        for filename in self.iter_cdx_files():
            outfile = filename + 'j'
            tmpfile = outfile + '.tmp'

            print('Converting {0} -> {1}'.format(filename, outfile))

            try:
                with open(tmpfile, 'w+') as out, open(filename, 'rb') as fh:
                    for line in fh:
                        # Lines starting with ' CDX' are skipped, not
                        # converted (presumably the CDX format header).
                        if line.startswith(b' CDX'):
                            continue

                        cdx = CDXObject(line)
                        cdx[URLKEY] = canonicalize(cdx[ORIGINAL])
                        cdxj_writer.write_cdx_line(out, cdx, cdx['filename'])
            except BaseException:
                # Do not leave a truncated temp file behind on failure or
                # interruption; both files are already closed at this point.
                if os.path.exists(tmpfile):
                    os.remove(tmpfile)
                raise

            # Only replace/remove once the full output has been written.
            shutil.move(tmpfile, outfile)
            os.remove(filename)