Source code for pywb.utils.binsearch

"""
Utility functions for performing binary search over a sorted text file
"""

from collections import deque
import itertools
import six

import sys

if six.PY3:
[docs] def cmp(a, b): return (a > b) - (a < b)
#=================================================================
[docs]def binsearch_offset(reader, key, compare_func=cmp, block_size=8192): """ Find offset of the line which matches a given 'key' using binary search If key is not found, the offset is of the line after the key File is subdivided into block_size (default 8192) sized blocks Optional compare_func may be specified """ min_ = 0 reader.seek(0, 2) max_ = int(reader.tell() / block_size) while max_ - min_ > 1: mid = int(min_ + ((max_ - min_) / 2)) reader.seek(mid * block_size) if mid > 0: reader.readline() # skip partial line line = reader.readline() if compare_func(key, line) > 0: min_ = mid else: max_ = mid return min_ * block_size
#=================================================================
[docs]def binsearch(reader, key, compare_func=cmp, block_size=8192): """ Perform a binary search for a specified key to within a 'block_size' (default 8192) granularity, and return first full line found. """ min_ = binsearch_offset(reader, key, compare_func, block_size) reader.seek(min_) if min_ > 0: reader.readline() # skip partial line def gen_iter(line): while line: yield line.rstrip() line = reader.readline() return gen_iter(reader.readline())
#=================================================================
[docs]def linearsearch(iter_, key, prev_size=0, compare_func=cmp): """ Perform a linear search over iterator until current_line >= key optionally also tracking upto N previous lines, which are returned before the first matched line. if end of stream is reached before a match is found, nothing is returned (prev lines discarded also) """ prev_deque = deque(maxlen=prev_size + 1) matched = False for line in iter_: prev_deque.append(line) if compare_func(line, key) >= 0: matched = True break # no matches, so pop last line, but return rest of prev lines, if any if not matched: if not prev_size or len(prev_deque) <= 1: return iter([]) prev_deque.popleft() return iter(prev_deque) return itertools.chain(prev_deque, iter_)
#================================================================= #=================================================================
[docs]def iter_range(reader, start, end, prev_size=0): """ Creates an iterator which iterates over lines where start <= line < end (end exclusive) """ iter_ = search(reader, start, prev_size=prev_size) end_iter = itertools.takewhile( lambda line: line < end, iter_) return end_iter
#=================================================================
[docs]def iter_prefix(reader, key): """ Creates an iterator which iterates over lines that start with prefix 'key' in a sorted text file. """ return itertools.takewhile( lambda line: line.startswith(key), search(reader, key))
#=================================================================
[docs]def iter_exact(reader, key, token=b' '): """ Create an iterator which iterates over lines where the first field matches the 'key', equivalent to token + sep prefix. Default field termin_ator/seperator is ' ' """ return iter_prefix(reader, key + token)