|
""" |
|
This module contains various general utility functions. |
|
""" |
|
|
|
from __future__ import with_statement |
|
|
|
import logging |
|
logger = logging.getLogger('gensim.utils') |
|
|
|
try: |
|
from html.entities import name2codepoint as n2cp |
|
except ImportError: |
|
from htmlentitydefs import name2codepoint as n2cp |
|
try: |
|
import cPickle as _pickle |
|
except ImportError: |
|
import pickle as _pickle |
|
|
|
import re |
|
import unicodedata |
|
import os |
|
import random |
|
import itertools |
|
import tempfile |
|
from functools import wraps # for `synchronous` function lock |
|
import multiprocessing |
|
import shutil |
|
import sys |
|
import traceback |
|
from contextlib import contextmanager |
|
|
|
import numpy |
|
import scipy.sparse |
|
|
|
if sys.version_info[0] >= 3: |
|
unicode = str |
|
|
|
from six import iteritems, u, string_types, unichr  # unichr: needed by decode_htmlentities() under Python 3
|
from six.moves import xrange |
|
|
|
try: |
|
from pattern.en import parse |
|
logger.info("'pattern' package found; utils.lemmatize() is available for English") |
|
HAS_PATTERN = True |
|
except ImportError: |
|
HAS_PATTERN = False |
|
|
|
|
|
PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)
|
RE_HTML_ENTITY = re.compile(r'&(#?)(x?)(\w+);', re.UNICODE) |
|
|
|
|
|
|
|
def synchronous(tlockname): |
|
""" |
|
A decorator to place an instance-based lock around a method. |
|
Adapted from http://code.activestate.com/recipes/577105-synchronization-decorator-for-class-methods/ |
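
    Illustrative usage (a hypothetical `Counter` class guarding its state with an `RLock`):

    >>> import threading
    >>> class Counter(object):
    ...     def __init__(self):
    ...         self.lock = threading.RLock()
    ...         self.value = 0
    ...     @synchronous('lock')
    ...     def incr(self):
    ...         self.value += 1
    >>> c = Counter()
    >>> c.incr()
    >>> c.value
    1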
|
""" |
|
def _synched(func): |
|
@wraps(func) |
|
def _synchronizer(self, *args, **kwargs): |
|
tlock = getattr(self, tlockname) |
|
logger.debug("acquiring lock %r for %s" % (tlockname, func.func_name)) |
|
|
|
with tlock: # use lock as a context manager to perform safe acquire/release pairs |
|
logger.debug("acquired lock %r for %s" % (tlockname, func.func_name)) |
|
result = func(self, *args, **kwargs) |
|
logger.debug("releasing lock %r for %s" % (tlockname, func.func_name)) |
|
return result |
|
return _synchronizer |
|
return _synched |
|
|
|
|
|
class NoCM(object): |
|
def acquire(self): |
|
pass |
|
def release(self): |
|
pass |
|
def __enter__(self): |
|
pass |
|
def __exit__(self, type, value, traceback): |
|
pass |
|
nocm = NoCM() |
|
|
|
|
|
@contextmanager |
|
def file_or_filename(input): |
|
""" |
|
Return a file-like object ready to be read from the beginning. `input` is either |
|
a filename (gz/bz2 also supported) or a file-like object supporting seek. |
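
    Illustrative usage (hypothetical file name, hence skipped):

    >>> with file_or_filename('/tmp/corpus.txt') as fin:  # doctest: +SKIP
    ...     first_line = fin.readline()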
|
""" |
|
if isinstance(input, string_types): |
|
# input was a filename: open as text file |
|
with smart_open(input) as fin: |
|
yield fin |
|
else: |
|
input.seek(0) |
|
yield input |
|
|
|
|
|
def deaccent(text): |
|
""" |
|
Remove accentuation from the given string. Input text is either a unicode string or utf8 encoded bytestring. |
|
Return input string with accents removed, as unicode. |
|
>>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek") |
|
u'Sef chomutovskych komunistu dostal postou bily prasek' |
|
""" |
|
if not isinstance(text, unicode): |
|
# assume utf8 for byte strings, use default (strict) error handling |
|
text = text.decode('utf8') |
|
norm = unicodedata.normalize("NFD", text) |
|
result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn') |
|
return unicodedata.normalize("NFC", result) |
|
|
|
|
|
def copytree_hardlink(source, dest): |
|
""" |
|
    Recursively copy a directory, a la shutil.copytree, but hardlink files
|
instead of copying. Available on UNIX systems only. |
|
""" |
|
copy2 = shutil.copy2 |
|
try: |
|
shutil.copy2 = os.link |
|
shutil.copytree(source, dest) |
|
finally: |
|
shutil.copy2 = copy2 |
|
|
|
|
|
def tokenize(text, lowercase=False, deacc=False, errors="strict", to_lower=False, lower=False): |
|
""" |
|
Iteratively yield tokens as unicode strings, optionally also lowercasing them |
|
and removing accent marks. |
|
Input text may be either unicode or utf8-encoded byte string. |
|
The tokens on output are maximal contiguous sequences of alphabetic |
|
characters (no digits!). |
|
>>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc = True)) |
|
[u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu'] |
|
""" |
|
lowercase = lowercase or to_lower or lower |
|
text = to_unicode(text, errors=errors) |
|
if lowercase: |
|
text = text.lower() |
|
if deacc: |
|
text = deaccent(text) |
|
for match in PAT_ALPHABETIC.finditer(text): |
|
yield match.group() |
|
|
|
|
|
def simple_preprocess(doc, deacc=False, min_len=2, max_len=15): |
|
""" |
|
Convert a document into a list of tokens. |
|
    This lowercases and tokenizes the text (no stemming) -- the output is a list of
    final tokens (unicode strings), which won't be processed any further.
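
    Illustrative example (the `u` prefix appears only under Python 2):

    >>> simple_preprocess("Hello, World! 42 times.")
    [u'hello', u'world', u'times']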
|
""" |
|
tokens = [token for token in tokenize(doc, lower=True, deacc=deacc, errors='ignore') |
|
if min_len <= len(token) <= max_len and not token.startswith('_')] |
|
return tokens |
|
|
|
|
|
def any2utf8(text, errors='strict', encoding='utf8'): |
|
"""Convert a string (unicode or bytestring in `encoding`), to bytestring in utf8.""" |
|
if isinstance(text, unicode): |
|
return text.encode('utf8') |
|
# do bytestring -> unicode -> utf8 full circle, to ensure valid utf8 |
|
return unicode(text, encoding, errors=errors).encode('utf8') |
|
to_utf8 = any2utf8 |
|
|
|
|
|
def any2unicode(text, encoding='utf8', errors='strict'): |
|
"""Convert a string (bytestring in `encoding` or unicode), to unicode.""" |
|
if isinstance(text, unicode): |
|
return text |
|
return unicode(text, encoding, errors=errors) |
|
to_unicode = any2unicode |
|
|
|
|
|
class SaveLoad(object): |
|
""" |
|
Objects which inherit from this class have save/load functions, which un/pickle |
|
them to disk. |
|
This uses pickle for de/serializing, so objects must not contain |
|
unpicklable attributes, such as lambda functions etc. |
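
    A minimal usage sketch (hypothetical subclass and file name):

    >>> class MyModel(SaveLoad):
    ...     def __init__(self):
    ...         self.weights = numpy.zeros(10)
    >>> model = MyModel()
    >>> model.save('/tmp/my_model')                       # doctest: +SKIP
    >>> loaded = MyModel.load('/tmp/my_model', mmap='r')  # doctest: +SKIP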
|
""" |
|
@classmethod |
|
def load(cls, fname, mmap=None): |
|
""" |
|
Load a previously saved object from file (also see `save`). |
|
If the object was saved with large arrays stored separately, you can load |
|
these arrays via mmap (shared memory) using `mmap='r'`. Default: don't use |
|
mmap, load large arrays as normal objects. |
|
""" |
|
logger.info("loading %s object from %s" % (cls.__name__, fname)) |
|
subname = lambda suffix: fname + '.' + suffix + '.npy' |
|
obj = unpickle(fname) |
|
for attrib in getattr(obj, '__numpys', []): |
|
logger.info("loading %s from %s with mmap=%s" % (attrib, subname(attrib), mmap)) |
|
setattr(obj, attrib, numpy.load(subname(attrib), mmap_mode=mmap)) |
|
for attrib in getattr(obj, '__scipys', []): |
|
logger.info("loading %s from %s with mmap=%s" % (attrib, subname(attrib), mmap)) |
|
sparse = unpickle(subname(attrib)) |
|
sparse.data = numpy.load(subname(attrib) + '.data.npy', mmap_mode=mmap) |
|
sparse.indptr = numpy.load(subname(attrib) + '.indptr.npy', mmap_mode=mmap) |
|
sparse.indices = numpy.load(subname(attrib) + '.indices.npy', mmap_mode=mmap) |
|
setattr(obj, attrib, sparse) |
|
for attrib in getattr(obj, '__ignoreds', []): |
|
logger.info("setting ignored attribute %s to None" % (attrib)) |
|
setattr(obj, attrib, None) |
|
return obj |
|
|
|
def save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset()): |
|
""" |
|
Save the object to file (also see `load`). |
|
If `separately` is None, automatically detect large numpy/scipy.sparse arrays |
|
in the object being stored, and store them into separate files. This avoids |
|
pickle memory errors and allows mmap'ing large arrays back on load efficiently. |
|
You can also set `separately` manually, in which case it must be a list of attribute |
|
names to be stored in separate files. The automatic check is not performed in this case. |
|
`ignore` is a set of attribute names to *not* serialize (file handles, caches etc). On |
|
subsequent load() these attributes will be set to None. |
|
""" |
|
logger.info("saving %s object under %s, separately %s" % (self.__class__.__name__, fname, separately)) |
|
subname = lambda suffix: fname + '.' + suffix + '.npy' |
|
tmp = {} |
|
if separately is None: |
|
separately = [] |
|
for attrib, val in iteritems(self.__dict__): |
|
if isinstance(val, numpy.ndarray) and val.size >= sep_limit: |
|
separately.append(attrib) |
|
elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and val.nnz >= sep_limit: |
|
separately.append(attrib) |
|
|
|
# whatever's in `separately` or `ignore` at this point won't get pickled anymore |
|
for attrib in separately + list(ignore): |
|
if hasattr(self, attrib): |
|
tmp[attrib] = getattr(self, attrib) |
|
delattr(self, attrib) |
|
|
|
try: |
|
numpys, scipys, ignoreds = [], [], [] |
|
for attrib, val in iteritems(tmp): |
|
if isinstance(val, numpy.ndarray) and attrib not in ignore: |
|
numpys.append(attrib) |
|
logger.info("storing numpy array '%s' to %s" % (attrib, subname(attrib))) |
|
numpy.save(subname(attrib), numpy.ascontiguousarray(val)) |
|
elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore: |
|
scipys.append(attrib) |
|
logger.info("storing scipy.sparse array '%s' under %s" % (attrib, subname(attrib))) |
|
numpy.save(subname(attrib) + '.data.npy', val.data) |
|
numpy.save(subname(attrib) + '.indptr.npy', val.indptr) |
|
numpy.save(subname(attrib) + '.indices.npy', val.indices) |
|
data, indptr, indices = val.data, val.indptr, val.indices |
|
val.data, val.indptr, val.indices = None, None, None |
|
try: |
|
pickle(val, subname(attrib)) # store array-less object |
|
finally: |
|
val.data, val.indptr, val.indices = data, indptr, indices |
|
else: |
|
logger.info("not storing attribute %s" % (attrib)) |
|
ignoreds.append(attrib) |
|
self.__dict__['__numpys'] = numpys |
|
self.__dict__['__scipys'] = scipys |
|
self.__dict__['__ignoreds'] = ignoreds |
|
pickle(self, fname) |
|
finally: |
|
# restore the attributes |
|
for attrib, val in iteritems(tmp): |
|
setattr(self, attrib, val) |
|
#endclass SaveLoad |
|
|
|
|
|
def identity(p): |
|
"""Identity fnc, for flows that don't accept lambda (picking etc).""" |
|
return p |
|
|
|
|
|
def get_max_id(corpus): |
|
""" |
|
Return the highest feature id that appears in the corpus. |
|
For empty corpora (no features at all), return -1. |
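
    For example:

    >>> get_max_id([[(1, 0.5), (7, 2.0)], [(3, 1.0)]])
    7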
|
""" |
|
maxid = -1 |
|
for document in corpus: |
|
maxid = max(maxid, max([-1] + [fieldid for fieldid, _ in document])) # [-1] to avoid exceptions from max(empty) |
|
return maxid |
|
|
|
|
|
class FakeDict(object): |
|
""" |
|
Objects of this class act as dictionaries that map integer->str(integer), for |
|
    a specified range of integers [0, num_terms).
|
This is meant to avoid allocating real dictionaries when `num_terms` is huge, which |
|
is a waste of memory. |
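
    For example:

    >>> d = FakeDict(3)
    >>> d[2], len(d)
    ('2', 3)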
|
""" |
|
def __init__(self, num_terms): |
|
self.num_terms = num_terms |
|
|
|
|
|
def __str__(self): |
|
return "FakeDict(num_terms=%s)" % self.num_terms |
|
|
|
|
|
def __getitem__(self, val): |
|
if 0 <= val < self.num_terms: |
|
return str(val) |
|
raise ValueError("internal id out of bounds (%s, expected <0..%s))" % |
|
(val, self.num_terms)) |
|
|
|
def iteritems(self): |
|
for i in xrange(self.num_terms): |
|
yield i, str(i) |
|
|
|
def keys(self): |
|
""" |
|
Override the dict.keys() function, which is used to determine the maximum |
|
internal id of a corpus = the vocabulary dimensionality. |
|
HACK: To avoid materializing the whole `range(0, self.num_terms)`, this returns |
|
the highest id = `[self.num_terms - 1]` only. |
|
""" |
|
return [self.num_terms - 1] |
|
|
|
def __len__(self): |
|
return self.num_terms |
|
|
|
def get(self, val, default=None): |
|
if 0 <= val < self.num_terms: |
|
return str(val) |
|
return default |
|
|
|
|
|
def dict_from_corpus(corpus): |
|
""" |
|
Scan corpus for all word ids that appear in it, then construct and return a mapping |
|
which maps each ``wordId -> str(wordId)``. |
|
This function is used whenever *words* need to be displayed (as opposed to just |
|
their ids) but no wordId->word mapping was provided. The resulting mapping |
|
only covers words actually used in the corpus, up to the highest wordId found. |
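
    For example:

    >>> id2word = dict_from_corpus([[(0, 1.0)], [(5, 2.0)]])
    >>> len(id2word)
    6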
|
""" |
|
num_terms = 1 + get_max_id(corpus) |
|
id2word = FakeDict(num_terms) |
|
return id2word |
|
|
|
|
|
def is_corpus(obj): |
|
""" |
|
Check whether `obj` is a corpus. Return (is_corpus, new) 2-tuple, where |
|
`new is obj` if `obj` was an iterable, or `new` yields the same sequence as |
|
`obj` if it was an iterator. |
|
`obj` is a corpus if it supports iteration over documents, where a document |
|
is in turn anything that acts as a sequence of 2-tuples (int, float). |
|
Note: An "empty" corpus (empty input sequence) is ambiguous, so in this case the |
|
result is forcefully defined as `is_corpus=False`. |
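
    For example:

    >>> is_corpus([[(0, 1.0)], [(1, 2.0), (2, 1.5)]])
    (True, [[(0, 1.0)], [(1, 2.0), (2, 1.5)]])
    >>> is_corpus("not a corpus")
    (False, 'not a corpus')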
|
""" |
|
try: |
|
if 'Corpus' in obj.__class__.__name__: # the most common case, quick hack |
|
return True, obj |
|
except: |
|
pass |
|
try: |
|
        if hasattr(obj, 'next') or hasattr(obj, '__next__'):  # iterator under Python 2 or 3
|
# the input is an iterator object, meaning once we call next() |
|
# that element could be gone forever. we must be careful to put |
|
# whatever we retrieve back again |
|
doc1 = next(obj) |
|
obj = itertools.chain([doc1], obj) |
|
else: |
|
doc1 = next(iter(obj)) # empty corpus is resolved to False here |
|
if len(doc1) == 0: # sparse documents must have a __len__ function (list, tuple...) |
|
return True, obj # the first document is empty=>assume this is a corpus |
|
id1, val1 = next(iter(doc1)) # if obj is a numpy array, it resolves to False here |
|
id1, val1 = int(id1), float(val1) # must be a 2-tuple (integer, float) |
|
except: |
|
return False, obj |
|
return True, obj |
|
|
|
|
|
|
|
def get_my_ip(): |
|
""" |
|
Try to obtain our external ip (from the pyro nameserver's point of view) |
|
This tries to sidestep the issue of bogus `/etc/hosts` entries and other |
|
local misconfigurations, which often mess up hostname resolution. |
|
If all else fails, fall back to simple `socket.gethostbyname()` lookup. |
|
""" |
|
import socket |
|
try: |
|
import Pyro4 |
|
# we know the nameserver must exist, so use it as our anchor point |
|
ns = Pyro4.naming.locateNS() |
|
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) |
|
s.connect((ns._pyroUri.host, ns._pyroUri.port)) |
|
result, port = s.getsockname() |
|
except: |
|
try: |
|
# see what ifconfig says about our default interface |
|
            import commands  # Python 2 only; on Python 3 this fails and we fall through to gethostbyname below
|
result = commands.getoutput("ifconfig").split("\n")[1].split()[1][5:] |
|
if len(result.split('.')) != 4: |
|
raise Exception() |
|
except: |
|
# give up, leave the resolution to gethostbyname |
|
result = socket.gethostbyname(socket.gethostname()) |
|
return result |
|
|
|
|
|
class RepeatCorpus(SaveLoad): |
|
""" |
|
Used in the tutorial on distributed computing and likely not useful anywhere else. |
|
""" |
|
def __init__(self, corpus, reps): |
|
""" |
|
Wrap a `corpus` as another corpus of length `reps`. This is achieved by |
|
repeating documents from `corpus` over and over again, until the requested |
|
length `len(result)==reps` is reached. Repetition is done |
|
        on the fly (memory-efficiently), via `itertools`.
|
>>> corpus = [[(1, 0.5)], []] # 2 documents |
|
>>> list(RepeatCorpus(corpus, 5)) # repeat 2.5 times to get 5 documents |
|
[[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)]] |
|
""" |
|
self.corpus = corpus |
|
self.reps = reps |
|
|
|
def __iter__(self): |
|
return itertools.islice(itertools.cycle(self.corpus), self.reps) |
|
|
|
class ClippedCorpus(SaveLoad): |
|
def __init__(self, corpus, max_docs=None): |
|
""" |
|
Return a corpus that is the "head" of input iterable `corpus`. |
|
Any documents after `max_docs` are ignored. This effectively limits the |
|
length of the returned corpus to <= `max_docs`. Set `max_docs=None` for |
|
"no limit", effectively wrapping the entire input corpus. |
|
""" |
|
self.corpus = corpus |
|
self.max_docs = max_docs |
|
|
|
def __iter__(self): |
|
return itertools.islice(self.corpus, self.max_docs) |
|
|
|
def __len__(self): |
|
        # `max_docs=None` means "no limit" (see docstring above); avoid min(None, ...)
        return len(self.corpus) if self.max_docs is None else min(self.max_docs, len(self.corpus))
|
|
|
def decode_htmlentities(text): |
|
""" |
|
Decode HTML entities in text, coded as hex, decimal or named. |
|
Adapted from http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py |
|
    >>> u = u'E tu vivrai nel terrore - L&#x27;aldil&#xE0; (1981)'
|
>>> print(decode_htmlentities(u).encode('UTF-8')) |
|
E tu vivrai nel terrore - L'aldilà (1981) |
|
>>> print(decode_htmlentities("l'eau")) |
|
l'eau |
|
>>> print(decode_htmlentities("foo < bar")) |
|
foo < bar |
|
""" |
|
def substitute_entity(match): |
|
ent = match.group(3) |
|
if match.group(1) == "#": |
|
# decoding by number |
|
if match.group(2) == '': |
|
# number is in decimal |
|
return unichr(int(ent)) |
|
elif match.group(2) == 'x': |
|
# number is in hex |
|
return unichr(int('0x' + ent, 16)) |
|
else: |
|
# they were using a name |
|
cp = n2cp.get(ent) |
|
if cp: |
|
return unichr(cp) |
|
else: |
|
return match.group() |
|
|
|
try: |
|
return RE_HTML_ENTITY.sub(substitute_entity, text) |
|
except: |
|
# in case of errors, return input |
|
# e.g., ValueError: unichr() arg not in range(0x10000) (narrow Python build) |
|
return text |
|
|
|
|
|
def chunkize_serial(iterable, chunksize, as_numpy=False): |
|
""" |
|
Return elements from the iterable in `chunksize`-ed lists. The last returned |
|
element may be smaller (if length of collection is not divisible by `chunksize`). |
|
>>> print(list(grouper(range(10), 3))) |
|
[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]] |
|
""" |
|
import numpy |
|
it = iter(iterable) |
|
while True: |
|
if as_numpy: |
|
# convert each document to a 2d numpy array (~6x faster when transmitting |
|
# chunk data over the wire, in Pyro) |
|
wrapped_chunk = [[numpy.array(doc) for doc in itertools.islice(it, int(chunksize))]] |
|
else: |
|
wrapped_chunk = [list(itertools.islice(it, int(chunksize)))] |
|
if not wrapped_chunk[0]: |
|
break |
|
# memory opt: wrap the chunk and then pop(), to avoid leaving behind a dangling reference |
|
yield wrapped_chunk.pop() |
|
|
|
grouper = chunkize_serial |
|
|
|
|
|
|
|
class InputQueue(multiprocessing.Process): |
|
def __init__(self, q, corpus, chunksize, maxsize, as_numpy): |
|
super(InputQueue, self).__init__() |
|
self.q = q |
|
self.maxsize = maxsize |
|
self.corpus = corpus |
|
self.chunksize = chunksize |
|
self.as_numpy = as_numpy |
|
|
|
def run(self): |
|
if self.as_numpy: |
|
import numpy # don't clutter the global namespace with a dependency on numpy |
|
it = iter(self.corpus) |
|
while True: |
|
chunk = itertools.islice(it, self.chunksize) |
|
if self.as_numpy: |
|
# HACK XXX convert documents to numpy arrays, to save memory. |
|
# This also gives a scipy warning at runtime: |
|
# "UserWarning: indices array has non-integer dtype (float64)" |
|
wrapped_chunk = [[numpy.asarray(doc) for doc in chunk]] |
|
else: |
|
wrapped_chunk = [list(chunk)] |
|
|
|
if not wrapped_chunk[0]: |
|
self.q.put(None, block=True) |
|
break |
|
|
|
try: |
|
qsize = self.q.qsize() |
|
except NotImplementedError: |
|
qsize = '?' |
|
logger.debug("prepared another chunk of %i documents (qsize=%s)" % |
|
(len(wrapped_chunk[0]), qsize)) |
|
self.q.put(wrapped_chunk.pop(), block=True) |
|
#endclass InputQueue |
|
|
|
|
|
if os.name == 'nt': |
|
logger.info("detected Windows; aliasing chunkize to chunkize_serial") |
|
|
|
def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): |
|
for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy): |
|
yield chunk |
|
else: |
|
def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): |
|
""" |
|
Split a stream of values into smaller chunks. |
|
Each chunk is of length `chunksize`, except the last one which may be smaller. |
|
A once-only input stream (`corpus` from a generator) is ok, chunking is done |
|
efficiently via itertools. |
|
If `maxsize > 1`, don't wait idly in between successive chunk `yields`, but |
|
rather keep filling a short queue (of size at most `maxsize`) with forthcoming |
|
chunks in advance. This is realized by starting a separate process, and is |
|
meant to reduce I/O delays, which can be significant when `corpus` comes |
|
from a slow medium (like harddisk). |
|
        If `maxsize==0`, don't fool around with parallelism and simply yield the chunks
        serially, via `chunkize_serial()` (no I/O optimizations).
|
>>> for chunk in chunkize(range(10), 4): print(chunk) |
|
[0, 1, 2, 3] |
|
[4, 5, 6, 7] |
|
[8, 9] |
|
""" |
|
assert chunksize > 0 |
|
|
|
if maxsize > 0: |
|
q = multiprocessing.Queue(maxsize=maxsize) |
|
worker = InputQueue(q, corpus, chunksize, maxsize=maxsize, as_numpy=as_numpy) |
|
worker.daemon = True |
|
worker.start() |
|
while True: |
|
chunk = [q.get(block=True)] |
|
if chunk[0] is None: |
|
break |
|
yield chunk.pop() |
|
else: |
|
for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy): |
|
yield chunk |
|
|
|
|
|
def make_closing(base, **attrs): |
|
""" |
|
Add support for `with Base(attrs) as fout:` to the base class if it's missing. |
|
The base class' `close()` method will be called on context exit, to always close the file properly. |
|
This is needed for gzip.GzipFile, bz2.BZ2File etc in older Pythons (<=2.6), which otherwise |
|
raise "AttributeError: GzipFile instance has no attribute '__exit__'". |
|
""" |
|
if not hasattr(base, '__enter__'): |
|
attrs['__enter__'] = lambda self: self |
|
if not hasattr(base, '__exit__'): |
|
attrs['__exit__'] = lambda self, type, value, traceback: self.close() |
|
return type('Closing' + base.__name__, (base, object), attrs) |
|
|
|
|
|
def smart_open(fname, mode='rb'): |
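    """
    Open `fname` and return a file object, transparently decompressing `.bz2`
    and `.gz` files based on the filename extension. The returned object also
    works as a context manager (see `make_closing`).
    """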
|
_, ext = os.path.splitext(fname) |
|
if ext == '.bz2': |
|
from bz2 import BZ2File |
|
return make_closing(BZ2File)(fname, mode) |
|
if ext == '.gz': |
|
from gzip import GzipFile |
|
return make_closing(GzipFile)(fname, mode) |
|
return open(fname, mode) |
|
|
|
|
|
def pickle(obj, fname, protocol=-1): |
|
"""Pickle object `obj` to file `fname`.""" |
|
with smart_open(fname, 'wb') as fout: # 'b' for binary, needed on Windows |
|
_pickle.dump(obj, fout, protocol=protocol) |
|
|
|
|
|
def unpickle(fname): |
|
"""Load pickled object from `fname`""" |
|
with smart_open(fname) as f: |
|
return _pickle.load(f) |
|
|
|
|
|
def revdict(d): |
|
""" |
|
Reverse a dictionary mapping. |
|
When two keys map to the same value, only one of them will be kept in the |
|
result (which one is kept is arbitrary). |
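
    For example:

    >>> sorted(revdict({'a': 1, 'b': 2}).items())
    [(1, 'a'), (2, 'b')]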
|
""" |
|
return dict((v, k) for (k, v) in iteritems(d)) |
|
|
|
|
|
def toptexts(query, texts, index, n=10): |
|
""" |
|
    Debug function to help inspect the top `n` most similar documents (according to a
|
similarity index `index`), to see if they are actually related to the query. |
|
`texts` is any object that can return something insightful for each document |
|
via `texts[docid]`, such as its fulltext or snippet. |
|
Return a list of 3-tuples (docid, doc's similarity to the query, texts[docid]). |
|
""" |
|
sims = index[query] # perform a similarity query against the corpus |
|
sims = sorted(enumerate(sims), key=lambda item: -item[1]) |
|
|
|
result = [] |
|
for topid, topcosine in sims[:n]: # only consider top-n most similar docs |
|
result.append((topid, topcosine, texts[topid])) |
|
return result |
|
|
|
|
|
def randfname(prefix='gensim'): |
|
randpart = hex(random.randint(0, 0xffffff))[2:] |
|
return os.path.join(tempfile.gettempdir(), prefix + randpart) |
|
|
|
|
|
def upload_chunked(server, docs, chunksize=1000, preprocess=None): |
|
""" |
|
Memory-friendly upload of documents to a SimServer (or Pyro SimServer proxy). |
|
Use this function to train or index large collections -- avoid sending the |
|
entire corpus over the wire as a single Pyro in-memory object. The documents |
|
will be sent in smaller chunks, of `chunksize` documents each. |
|
""" |
|
start = 0 |
|
for chunk in grouper(docs, chunksize): |
|
end = start + len(chunk) |
|
logger.info("uploading documents %i-%i" % (start, end - 1)) |
|
if preprocess is not None: |
|
pchunk = [] |
|
for doc in chunk: |
|
doc['tokens'] = preprocess(doc['text']) |
|
del doc['text'] |
|
pchunk.append(doc) |
|
chunk = pchunk |
|
server.buffer(chunk) |
|
start = end |
|
|
|
|
|
def getNS(): |
|
""" |
|
Return a Pyro name server proxy. If there is no name server running, |
|
start one on 0.0.0.0 (all interfaces), as a background process. |
|
""" |
|
import Pyro4 |
|
try: |
|
return Pyro4.locateNS() |
|
except Pyro4.errors.NamingError: |
|
logger.info("Pyro name server not found; starting a new one") |
|
os.system("python -m Pyro4.naming -n 0.0.0.0 &") |
|
# TODO: spawn a proper daemon ala http://code.activestate.com/recipes/278731/ ? |
|
# like this, if there's an error somewhere, we'll never know... (and the loop |
|
# below will block). And it probably doesn't work on windows, either. |
|
while True: |
|
try: |
|
return Pyro4.locateNS() |
|
except: |
|
pass |
|
|
|
|
|
def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None): |
|
""" |
|
Register object with name server (starting the name server if not running |
|
yet) and block until the daemon is terminated. The object is registered under |
|
`name`, or `name`+ some random suffix if `random_suffix` is set. |
|
""" |
|
if random_suffix: |
|
name += '.' + hex(random.randint(0, 0xffffff))[2:] |
|
import Pyro4 |
|
with getNS() as ns: |
|
with Pyro4.Daemon(ip or get_my_ip(), port or 0) as daemon: |
|
# register server for remote access |
|
uri = daemon.register(obj, name) |
|
ns.remove(name) |
|
ns.register(name, uri) |
|
logger.info("%s registered with nameserver (URI '%s')" % (name, uri)) |
|
daemon.requestLoop() |
|
|
|
|
|
if HAS_PATTERN: |
|
def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False, stopwords=frozenset()): |
|
""" |
|
This function is only available when the optional 'pattern' package is installed. |
|
Use the English lemmatizer from `pattern` to extract tokens in |
|
        their base form (lemma), e.g. "are, is, being" -> "be" etc.
|
This is a smarter version of stemming, taking word context into account. |
|
        Only considers nouns, verbs, adjectives and adverbs by default (all other lemmas are discarded).
|
>>> lemmatize('Hello World! How is it going?! Nonexistentword, 21') |
|
['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN'] |
|
>>> lemmatize('The study ranks high.') |
|
['study/NN', 'rank/VB', 'high/JJ'] |
|
>>> lemmatize('The ranks study hard.') |
|
['rank/NN', 'study/VB', 'hard/RB'] |
|
""" |
|
if light: |
|
import warnings |
|
warnings.warn("The light flag is no longer supported by pattern.") |
|
|
|
# tokenization in `pattern` is weird; it gets thrown off by non-letters, |
|
# producing '==relate/VBN' or '**/NN'... try to preprocess the text a little |
|
# FIXME this throws away all fancy parsing cues, including sentence structure, |
|
# abbreviations etc. |
|
content = u(' ').join(tokenize(content, lower=True, errors='ignore')) |
|
|
|
parsed = parse(content, lemmata=True, collapse=False) |
|
result = [] |
|
for sentence in parsed: |
|
for token, tag, _, _, lemma in sentence: |
|
if 2 <= len(lemma) <= 15 and not lemma.startswith('_') and lemma not in stopwords: |
|
if allowed_tags.match(tag): |
|
lemma += "/" + tag[:2] |
|
result.append(lemma.encode('utf8')) |
|
return result |