| """ | 
 | This module contains various general utility functions. | 
 | """ | 
 |  | 
 | from __future__ import with_statement | 
 |  | 
 | import logging | 
 | logger = logging.getLogger('gensim.utils') | 
 |  | 
 | try: | 
 |  from html.entities import name2codepoint as n2cp | 
 | except ImportError: | 
 |  from htmlentitydefs import name2codepoint as n2cp | 
 | try: | 
 |  import cPickle as _pickle | 
 | except ImportError: | 
 |  import pickle as _pickle | 
 |  | 
 | import re | 
 | import unicodedata | 
 | import os | 
 | import random | 
 | import itertools | 
 | import tempfile | 
 | from functools import wraps # for `synchronous` function lock | 
 | import multiprocessing | 
 | import shutil | 
 | import sys | 
 | import traceback | 
 | from contextlib import contextmanager | 
 |  | 
 | import numpy | 
 | import scipy.sparse | 
 |  | 
if sys.version_info[0] >= 3:
    unicode = str
    unichr = chr  # `unichr` is needed by decode_htmlentities() below
 |  | 
 | from six import iteritems, u, string_types | 
 | from six.moves import xrange | 
 |  | 
 | try: | 
 |  from pattern.en import parse | 
 |  logger.info("'pattern' package found; utils.lemmatize() is available for English") | 
 |  HAS_PATTERN = True | 
 | except ImportError: | 
 |  HAS_PATTERN = False | 
 |  | 
 |  | 
PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)
 | RE_HTML_ENTITY = re.compile(r'&(#?)(x?)(\w+);', re.UNICODE) | 
 |  | 
 |  | 
 |  | 
 | def synchronous(tlockname): | 
 |  """ | 
 |  A decorator to place an instance-based lock around a method. | 
 |  Adapted from http://code.activestate.com/recipes/577105-synchronization-decorator-for-class-methods/ | 
 |  """ | 
 |  def _synched(func): | 
 |  @wraps(func) | 
 |  def _synchronizer(self, *args, **kwargs): | 
 |  tlock = getattr(self, tlockname) | 
 |  logger.debug("acquiring lock %r for %s" % (tlockname, func.func_name)) | 
 |  | 
 |  with tlock: # use lock as a context manager to perform safe acquire/release pairs | 
 |  logger.debug("acquired lock %r for %s" % (tlockname, func.func_name)) | 
 |  result = func(self, *args, **kwargs) | 
 |  logger.debug("releasing lock %r for %s" % (tlockname, func.func_name)) | 
 |  return result | 
 |  return _synchronizer | 
 |  return _synched | 
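

# Illustrative sketch (hypothetical example, not part of the original module):
# guarding a method with an instance-level `threading.RLock` stored under the
# attribute name passed to `synchronous`. The `_ExampleCounter` class is made up
# purely for demonstration.
def _example_synchronous():
    import threading

    class _ExampleCounter(object):
        def __init__(self):
            self.lock = threading.RLock()  # the decorator looks this attribute up by name
            self.value = 0

        @synchronous('lock')
        def increment(self):
            self.value += 1

    counter = _ExampleCounter()
    counter.increment()
    return counter.value  # == 1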
 |  | 
 |  | 
 | class NoCM(object): | 
 |  def acquire(self): | 
 |  pass | 
 |  def release(self): | 
 |  pass | 
 |  def __enter__(self): | 
 |  pass | 
 |  def __exit__(self, type, value, traceback): | 
 |  pass | 
 | nocm = NoCM() | 
 |  | 
 |  | 
 | @contextmanager | 
 | def file_or_filename(input): | 
 |  """ | 
 |  Return a file-like object ready to be read from the beginning. `input` is either | 
 |  a filename (gz/bz2 also supported) or a file-like object supporting seek. | 
 |  """ | 
 |  if isinstance(input, string_types): | 
 |  # input was a filename: open as text file | 
 |  with smart_open(input) as fin: | 
 |  yield fin | 
 |  else: | 
 |  input.seek(0) | 
 |  yield input | 
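

# Illustrative sketch (hypothetical paths/objects): `file_or_filename` accepts
# either a filename (here a temporary file created just for the example) or an
# already opened, seekable file-like object such as `io.BytesIO`.
def _example_file_or_filename():
    import io
    import tempfile

    with io.BytesIO(b"hello world") as buf:
        with file_or_filename(buf) as fin:
            first = fin.read()

    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.write(b"hello world")
    tmp.close()
    with file_or_filename(tmp.name) as fin:
        second = fin.read()
    os.remove(tmp.name)
    return first, second  # both b"hello world"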
 |  | 
 |  | 
 | def deaccent(text): | 
 |  """ | 
 |  Remove accentuation from the given string. Input text is either a unicode string or utf8 encoded bytestring. | 
 |  Return input string with accents removed, as unicode. | 
 |  >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek") | 
 |  u'Sef chomutovskych komunistu dostal postou bily prasek' | 
 |  """ | 
 |  if not isinstance(text, unicode): | 
 |  # assume utf8 for byte strings, use default (strict) error handling | 
 |  text = text.decode('utf8') | 
 |  norm = unicodedata.normalize("NFD", text) | 
 |  result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn') | 
 |  return unicodedata.normalize("NFC", result) | 
 |  | 
 |  | 
 | def copytree_hardlink(source, dest): | 
 |  """ | 
    Recursively copy a directory a la shutil.copytree, but hardlink files
 |  instead of copying. Available on UNIX systems only. | 
 |  """ | 
 |  copy2 = shutil.copy2 | 
 |  try: | 
 |  shutil.copy2 = os.link | 
 |  shutil.copytree(source, dest) | 
 |  finally: | 
 |  shutil.copy2 = copy2 | 
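

# Illustrative sketch (hypothetical paths, UNIX only): after the call, each file
# in the copy shares the same inode as the original, so no extra disk space is used.
def _example_copytree_hardlink():
    import tempfile

    src = tempfile.mkdtemp()
    with open(os.path.join(src, 'a.txt'), 'wb') as fout:
        fout.write(b"data")
    dest = src + '_hardlinked'
    copytree_hardlink(src, dest)
    same = os.path.samefile(os.path.join(src, 'a.txt'),
                            os.path.join(dest, 'a.txt'))  # True on UNIX
    shutil.rmtree(src)
    shutil.rmtree(dest)
    return same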
 |  | 
 |  | 
 | def tokenize(text, lowercase=False, deacc=False, errors="strict", to_lower=False, lower=False): | 
 |  """ | 
 |  Iteratively yield tokens as unicode strings, optionally also lowercasing them | 
 |  and removing accent marks. | 
 |  Input text may be either unicode or utf8-encoded byte string. | 
 |  The tokens on output are maximal contiguous sequences of alphabetic | 
 |  characters (no digits!). | 
 |  >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc = True)) | 
 |  [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu'] | 
 |  """ | 
 |  lowercase = lowercase or to_lower or lower | 
 |  text = to_unicode(text, errors=errors) | 
 |  if lowercase: | 
 |  text = text.lower() | 
 |  if deacc: | 
 |  text = deaccent(text) | 
 |  for match in PAT_ALPHABETIC.finditer(text): | 
 |  yield match.group() | 
 |  | 
 |  | 
 | def simple_preprocess(doc, deacc=False, min_len=2, max_len=15): | 
 |  """ | 
    Convert a document into a list of tokens.
    This lowercases, tokenizes and optionally de-accents the text -- the output is
    a list of final tokens (unicode strings), not processed any further.
 |  """ | 
 |  tokens = [token for token in tokenize(doc, lower=True, deacc=deacc, errors='ignore') | 
 |  if min_len <= len(token) <= max_len and not token.startswith('_')] | 
 |  return tokens | 
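

# Illustrative sketch (hypothetical input): `simple_preprocess` lowercases and
# tokenizes, dropping tokens shorter than `min_len`, longer than `max_len`, and
# purely numeric runs (the tokenizer only keeps alphabetic sequences).
def _example_simple_preprocess():
    doc = "Hello, World! This is document #42."
    return simple_preprocess(doc)
    # -> ['hello', 'world', 'this', 'is', 'document']  (unicode strings)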
 |  | 
 |  | 
 | def any2utf8(text, errors='strict', encoding='utf8'): | 
 |  """Convert a string (unicode or bytestring in `encoding`), to bytestring in utf8.""" | 
 |  if isinstance(text, unicode): | 
 |  return text.encode('utf8') | 
 |  # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8 | 
 |  return unicode(text, encoding, errors=errors).encode('utf8') | 
 | to_utf8 = any2utf8 | 
 |  | 
 |  | 
 | def any2unicode(text, encoding='utf8', errors='strict'): | 
 |  """Convert a string (bytestring in `encoding` or unicode), to unicode.""" | 
 |  if isinstance(text, unicode): | 
 |  return text | 
 |  return unicode(text, encoding, errors=errors) | 
 | to_unicode = any2unicode | 
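

# Illustrative sketch: the two helpers are inverses in the common case; both
# accept either a bytestring or a unicode string on input.
def _example_unicode_helpers():
    assert to_utf8(u'aldil\xe0') == b'aldil\xc3\xa0'
    assert to_unicode(b'aldil\xc3\xa0') == u'aldil\xe0'
    return True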
 |  | 
 |  | 
 | class SaveLoad(object): | 
 |  """ | 
 |  Objects which inherit from this class have save/load functions, which un/pickle | 
 |  them to disk. | 
 |  This uses pickle for de/serializing, so objects must not contain | 
 |  unpicklable attributes, such as lambda functions etc. | 
 |  """ | 
 |  @classmethod | 
 |  def load(cls, fname, mmap=None): | 
 |  """ | 
 |  Load a previously saved object from file (also see `save`). | 
 |  If the object was saved with large arrays stored separately, you can load | 
 |  these arrays via mmap (shared memory) using `mmap='r'`. Default: don't use | 
 |  mmap, load large arrays as normal objects. | 
 |  """ | 
 |  logger.info("loading %s object from %s" % (cls.__name__, fname)) | 
 |  subname = lambda suffix: fname + '.' + suffix + '.npy' | 
 |  obj = unpickle(fname) | 
 |  for attrib in getattr(obj, '__numpys', []): | 
 |  logger.info("loading %s from %s with mmap=%s" % (attrib, subname(attrib), mmap)) | 
 |  setattr(obj, attrib, numpy.load(subname(attrib), mmap_mode=mmap)) | 
 |  for attrib in getattr(obj, '__scipys', []): | 
 |  logger.info("loading %s from %s with mmap=%s" % (attrib, subname(attrib), mmap)) | 
 |  sparse = unpickle(subname(attrib)) | 
 |  sparse.data = numpy.load(subname(attrib) + '.data.npy', mmap_mode=mmap) | 
 |  sparse.indptr = numpy.load(subname(attrib) + '.indptr.npy', mmap_mode=mmap) | 
 |  sparse.indices = numpy.load(subname(attrib) + '.indices.npy', mmap_mode=mmap) | 
 |  setattr(obj, attrib, sparse) | 
 |  for attrib in getattr(obj, '__ignoreds', []): | 
 |  logger.info("setting ignored attribute %s to None" % (attrib)) | 
 |  setattr(obj, attrib, None) | 
 |  return obj | 
 |  | 
 |  def save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset()): | 
 |  """ | 
 |  Save the object to file (also see `load`). | 
 |  If `separately` is None, automatically detect large numpy/scipy.sparse arrays | 
 |  in the object being stored, and store them into separate files. This avoids | 
 |  pickle memory errors and allows mmap'ing large arrays back on load efficiently. | 
 |  You can also set `separately` manually, in which case it must be a list of attribute | 
 |  names to be stored in separate files. The automatic check is not performed in this case. | 
 |  `ignore` is a set of attribute names to *not* serialize (file handles, caches etc). On | 
 |  subsequent load() these attributes will be set to None. | 
 |  """ | 
 |  logger.info("saving %s object under %s, separately %s" % (self.__class__.__name__, fname, separately)) | 
 |  subname = lambda suffix: fname + '.' + suffix + '.npy' | 
 |  tmp = {} | 
 |  if separately is None: | 
 |  separately = [] | 
 |  for attrib, val in iteritems(self.__dict__): | 
 |  if isinstance(val, numpy.ndarray) and val.size >= sep_limit: | 
 |  separately.append(attrib) | 
 |  elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and val.nnz >= sep_limit: | 
 |  separately.append(attrib) | 
 |  | 
 |  # whatever's in `separately` or `ignore` at this point won't get pickled anymore | 
 |  for attrib in separately + list(ignore): | 
 |  if hasattr(self, attrib): | 
 |  tmp[attrib] = getattr(self, attrib) | 
 |  delattr(self, attrib) | 
 |  | 
 |  try: | 
 |  numpys, scipys, ignoreds = [], [], [] | 
 |  for attrib, val in iteritems(tmp): | 
 |  if isinstance(val, numpy.ndarray) and attrib not in ignore: | 
 |  numpys.append(attrib) | 
 |  logger.info("storing numpy array '%s' to %s" % (attrib, subname(attrib))) | 
 |  numpy.save(subname(attrib), numpy.ascontiguousarray(val)) | 
 |  elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore: | 
 |  scipys.append(attrib) | 
 |  logger.info("storing scipy.sparse array '%s' under %s" % (attrib, subname(attrib))) | 
 |  numpy.save(subname(attrib) + '.data.npy', val.data) | 
 |  numpy.save(subname(attrib) + '.indptr.npy', val.indptr) | 
 |  numpy.save(subname(attrib) + '.indices.npy', val.indices) | 
 |  data, indptr, indices = val.data, val.indptr, val.indices | 
 |  val.data, val.indptr, val.indices = None, None, None | 
 |  try: | 
 |  pickle(val, subname(attrib)) # store array-less object | 
 |  finally: | 
 |  val.data, val.indptr, val.indices = data, indptr, indices | 
 |  else: | 
 |  logger.info("not storing attribute %s" % (attrib)) | 
 |  ignoreds.append(attrib) | 
 |  self.__dict__['__numpys'] = numpys | 
 |  self.__dict__['__scipys'] = scipys | 
 |  self.__dict__['__ignoreds'] = ignoreds | 
 |  pickle(self, fname) | 
 |  finally: | 
 |  # restore the attributes | 
 |  for attrib, val in iteritems(tmp): | 
 |  setattr(self, attrib, val) | 
 | #endclass SaveLoad | 
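

# Illustrative sketch (hypothetical class and temporary files): a SaveLoad
# subclass whose large numpy array is stored in a separate `.npy` file (forced
# here via a tiny `sep_limit`) and memory-mapped back on load.
class _ExampleModel(SaveLoad):
    """Hypothetical SaveLoad subclass, used only in the sketch below."""
    def __init__(self):
        self.vectors = numpy.ones((1000, 10))
        self.name = "example"


def _example_saveload():
    import tempfile

    fname = os.path.join(tempfile.gettempdir(), 'gensim_example_model')
    model = _ExampleModel()
    model.save(fname, sep_limit=1024)             # `vectors` is large enough to be stored separately
    loaded = _ExampleModel.load(fname, mmap='r')  # separate arrays come back as read-only memmaps
    return loaded.name, loaded.vectors.shape      # ('example', (1000, 10))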
 |  | 
 |  | 
 | def identity(p): | 
 |  """Identity fnc, for flows that don't accept lambda (picking etc).""" | 
 |  return p | 
 |  | 
 |  | 
 | def get_max_id(corpus): | 
 |  """ | 
 |  Return the highest feature id that appears in the corpus. | 
 |  For empty corpora (no features at all), return -1. | 
 |  """ | 
 |  maxid = -1 | 
 |  for document in corpus: | 
 |  maxid = max(maxid, max([-1] + [fieldid for fieldid, _ in document])) # [-1] to avoid exceptions from max(empty) | 
 |  return maxid | 
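

# Illustrative sketch: the corpus below uses feature ids 0, 1 and 3, so the
# highest id is 3; an empty corpus would yield -1.
def _example_get_max_id():
    corpus = [[(0, 1.0), (3, 2.0)], [(1, 0.5)], []]
    return get_max_id(corpus)  # == 3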
 |  | 
 |  | 
 | class FakeDict(object): | 
 |  """ | 
    Objects of this class act as dictionaries that map integer->str(integer), for
    a specified range of integers [0, num_terms).
 |  This is meant to avoid allocating real dictionaries when `num_terms` is huge, which | 
 |  is a waste of memory. | 
 |  """ | 
 |  def __init__(self, num_terms): | 
 |  self.num_terms = num_terms | 
 |  | 
 |  | 
 |  def __str__(self): | 
 |  return "FakeDict(num_terms=%s)" % self.num_terms | 
 |  | 
 |  | 
 |  def __getitem__(self, val): | 
 |  if 0 <= val < self.num_terms: | 
 |  return str(val) | 
 |  raise ValueError("internal id out of bounds (%s, expected <0..%s))" % | 
 |  (val, self.num_terms)) | 
 |  | 
 |  def iteritems(self): | 
 |  for i in xrange(self.num_terms): | 
 |  yield i, str(i) | 
 |  | 
 |  def keys(self): | 
 |  """ | 
        Override the dict.keys() function, which is used to determine the maximum
        internal id of a corpus (= the vocabulary dimensionality).
        HACK: To avoid materializing the whole `range(0, self.num_terms)`, this returns
        only the highest id, `[self.num_terms - 1]`.
 |  """ | 
 |  return [self.num_terms - 1] | 
 |  | 
 |  def __len__(self): | 
 |  return self.num_terms | 
 |  | 
 |  def get(self, val, default=None): | 
 |  if 0 <= val < self.num_terms: | 
 |  return str(val) | 
 |  return default | 
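

# Illustrative sketch: FakeDict(5) behaves like {0: '0', 1: '1', ..., 4: '4'}
# without ever allocating that dictionary.
def _example_fakedict():
    d = FakeDict(5)
    assert d[3] == '3'
    assert d.get(7, 'missing') == 'missing'
    assert len(d) == 5
    assert d.keys() == [4]  # only the highest id, by design (see FakeDict.keys)
    return d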
 |  | 
 |  | 
 | def dict_from_corpus(corpus): | 
 |  """ | 
 |  Scan corpus for all word ids that appear in it, then construct and return a mapping | 
 |  which maps each ``wordId -> str(wordId)``. | 
 |  This function is used whenever *words* need to be displayed (as opposed to just | 
 |  their ids) but no wordId->word mapping was provided. The resulting mapping | 
 |  only covers words actually used in the corpus, up to the highest wordId found. | 
 |  """ | 
 |  num_terms = 1 + get_max_id(corpus) | 
 |  id2word = FakeDict(num_terms) | 
 |  return id2word | 
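

# Illustrative sketch: the resulting mapping covers all ids 0..max_id found in
# the corpus, including ids that never occur in any document.
def _example_dict_from_corpus():
    corpus = [[(1, 0.5)], [(4, 1.0)]]
    id2word = dict_from_corpus(corpus)
    return len(id2word), id2word[4]  # (5, '4')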
 |  | 
 |  | 
 | def is_corpus(obj): | 
 |  """ | 
 |  Check whether `obj` is a corpus. Return (is_corpus, new) 2-tuple, where | 
 |  `new is obj` if `obj` was an iterable, or `new` yields the same sequence as | 
 |  `obj` if it was an iterator. | 
 |  `obj` is a corpus if it supports iteration over documents, where a document | 
 |  is in turn anything that acts as a sequence of 2-tuples (int, float). | 
 |  Note: An "empty" corpus (empty input sequence) is ambiguous, so in this case the | 
 |  result is forcefully defined as `is_corpus=False`. | 
 |  """ | 
 |  try: | 
 |  if 'Corpus' in obj.__class__.__name__: # the most common case, quick hack | 
 |  return True, obj | 
 |  except: | 
 |  pass | 
 |  try: | 
        if hasattr(obj, 'next') or hasattr(obj, '__next__'):
 |  # the input is an iterator object, meaning once we call next() | 
 |  # that element could be gone forever. we must be careful to put | 
 |  # whatever we retrieve back again | 
 |  doc1 = next(obj) | 
 |  obj = itertools.chain([doc1], obj) | 
 |  else: | 
 |  doc1 = next(iter(obj)) # empty corpus is resolved to False here | 
 |  if len(doc1) == 0: # sparse documents must have a __len__ function (list, tuple...) | 
 |  return True, obj # the first document is empty=>assume this is a corpus | 
 |  id1, val1 = next(iter(doc1)) # if obj is a numpy array, it resolves to False here | 
 |  id1, val1 = int(id1), float(val1) # must be a 2-tuple (integer, float) | 
 |  except: | 
 |  return False, obj | 
 |  return True, obj | 
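

# Illustrative sketch: a plain list of documents is recognized as a corpus and
# returned unchanged; a one-shot iterator is also accepted, but the returned
# object must then be used in place of the original, since its first document
# has already been peeked at and re-chained.
def _example_is_corpus():
    documents = [[(0, 1.0)], [(1, 2.0), (2, 0.5)]]
    ok, documents = is_corpus(documents)
    assert ok

    stream = iter([[(0, 1.0)], [(1, 2.0)]])
    ok, stream = is_corpus(stream)
    assert ok
    return list(stream)  # both documents are still there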
 |  | 
 |  | 
 |  | 
 | def get_my_ip(): | 
 |  """ | 
    Try to obtain our external IP (from the Pyro name server's point of view).
    This tries to sidestep the issue of bogus `/etc/hosts` entries and other
    local misconfigurations, which often mess up hostname resolution.
    If all else fails, fall back to a simple `socket.gethostbyname()` lookup.
 |  """ | 
 |  import socket | 
 |  try: | 
 |  import Pyro4 | 
 |  # we know the nameserver must exist, so use it as our anchor point | 
 |  ns = Pyro4.naming.locateNS() | 
 |  s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) | 
 |  s.connect((ns._pyroUri.host, ns._pyroUri.port)) | 
 |  result, port = s.getsockname() | 
 |  except: | 
 |  try: | 
 |  # see what ifconfig says about our default interface | 
            try:
                from commands import getoutput  # Python 2
            except ImportError:
                from subprocess import getoutput  # Python 3 (the `commands` module was removed)
            result = getoutput("ifconfig").split("\n")[1].split()[1][5:]
 |  if len(result.split('.')) != 4: | 
 |  raise Exception() | 
 |  except: | 
 |  # give up, leave the resolution to gethostbyname | 
 |  result = socket.gethostbyname(socket.gethostname()) | 
 |  return result | 
 |  | 
 |  | 
 | class RepeatCorpus(SaveLoad): | 
 |  """ | 
 |  Used in the tutorial on distributed computing and likely not useful anywhere else. | 
 |  """ | 
 |  def __init__(self, corpus, reps): | 
 |  """ | 
 |  Wrap a `corpus` as another corpus of length `reps`. This is achieved by | 
 |  repeating documents from `corpus` over and over again, until the requested | 
        length `len(result) == reps` is reached. Repetition is done
        on the fly (memory-efficiently), via `itertools`.
 |  >>> corpus = [[(1, 0.5)], []] # 2 documents | 
 |  >>> list(RepeatCorpus(corpus, 5)) # repeat 2.5 times to get 5 documents | 
 |  [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)]] | 
 |  """ | 
 |  self.corpus = corpus | 
 |  self.reps = reps | 
 |  | 
 |  def __iter__(self): | 
 |  return itertools.islice(itertools.cycle(self.corpus), self.reps) | 
 |  | 
 | class ClippedCorpus(SaveLoad): | 
 |  def __init__(self, corpus, max_docs=None): | 
 |  """ | 
 |  Return a corpus that is the "head" of input iterable `corpus`. | 
 |  Any documents after `max_docs` are ignored. This effectively limits the | 
 |  length of the returned corpus to <= `max_docs`. Set `max_docs=None` for | 
 |  "no limit", effectively wrapping the entire input corpus. | 
 |  """ | 
 |  self.corpus = corpus | 
 |  self.max_docs = max_docs | 
 |  | 
 |  def __iter__(self): | 
 |  return itertools.islice(self.corpus, self.max_docs) | 
 |  | 
    def __len__(self):
        # `max_docs=None` means "no limit": fall back to the underlying corpus length
        return len(self.corpus) if self.max_docs is None else min(self.max_docs, len(self.corpus))
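

# Illustrative sketch: clipping a 5-document corpus to its first 2 documents.
def _example_clipped_corpus():
    corpus = [[(0, 1.0)], [(1, 1.0)], [(2, 1.0)], [(3, 1.0)], [(4, 1.0)]]
    clipped = ClippedCorpus(corpus, max_docs=2)
    return len(clipped), list(clipped)  # (2, [[(0, 1.0)], [(1, 1.0)]])
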
 |  | 
 | def decode_htmlentities(text): | 
 |  """ | 
 |  Decode HTML entities in text, coded as hex, decimal or named. | 
 |  Adapted from http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py | 
    >>> u = u'E tu vivrai nel terrore - L&#x27;aldil&#xE0; (1981)'
    >>> print(decode_htmlentities(u).encode('UTF-8'))
    E tu vivrai nel terrore - L'aldilà (1981)
    >>> print(decode_htmlentities("l&#39;eau"))
    l'eau
    >>> print(decode_htmlentities("foo &lt; bar"))
    foo < bar
 |  """ | 
 |  def substitute_entity(match): | 
 |  ent = match.group(3) | 
 |  if match.group(1) == "#": | 
 |  # decoding by number | 
 |  if match.group(2) == '': | 
 |  # number is in decimal | 
 |  return unichr(int(ent)) | 
 |  elif match.group(2) == 'x': | 
 |  # number is in hex | 
 |  return unichr(int('0x' + ent, 16)) | 
 |  else: | 
 |  # they were using a name | 
 |  cp = n2cp.get(ent) | 
 |  if cp: | 
 |  return unichr(cp) | 
 |  else: | 
 |  return match.group() | 
 |  | 
 |  try: | 
 |  return RE_HTML_ENTITY.sub(substitute_entity, text) | 
 |  except: | 
 |  # in case of errors, return input | 
 |  # e.g., ValueError: unichr() arg not in range(0x10000) (narrow Python build) | 
 |  return text | 
 |  | 
 |  | 
 | def chunkize_serial(iterable, chunksize, as_numpy=False): | 
 |  """ | 
 |  Return elements from the iterable in `chunksize`-ed lists. The last returned | 
 |  element may be smaller (if length of collection is not divisible by `chunksize`). | 
 |  >>> print(list(grouper(range(10), 3))) | 
 |  [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]] | 
 |  """ | 
 |  import numpy | 
 |  it = iter(iterable) | 
 |  while True: | 
 |  if as_numpy: | 
 |  # convert each document to a 2d numpy array (~6x faster when transmitting | 
 |  # chunk data over the wire, in Pyro) | 
 |  wrapped_chunk = [[numpy.array(doc) for doc in itertools.islice(it, int(chunksize))]] | 
 |  else: | 
 |  wrapped_chunk = [list(itertools.islice(it, int(chunksize)))] | 
 |  if not wrapped_chunk[0]: | 
 |  break | 
 |  # memory opt: wrap the chunk and then pop(), to avoid leaving behind a dangling reference | 
 |  yield wrapped_chunk.pop() | 
 |  | 
 | grouper = chunkize_serial | 
 |  | 
 |  | 
 |  | 
 | class InputQueue(multiprocessing.Process): | 
 |  def __init__(self, q, corpus, chunksize, maxsize, as_numpy): | 
 |  super(InputQueue, self).__init__() | 
 |  self.q = q | 
 |  self.maxsize = maxsize | 
 |  self.corpus = corpus | 
 |  self.chunksize = chunksize | 
 |  self.as_numpy = as_numpy | 
 |  | 
 |  def run(self): | 
 |  if self.as_numpy: | 
 |  import numpy # don't clutter the global namespace with a dependency on numpy | 
 |  it = iter(self.corpus) | 
 |  while True: | 
 |  chunk = itertools.islice(it, self.chunksize) | 
 |  if self.as_numpy: | 
 |  # HACK XXX convert documents to numpy arrays, to save memory. | 
 |  # This also gives a scipy warning at runtime: | 
 |  # "UserWarning: indices array has non-integer dtype (float64)" | 
 |  wrapped_chunk = [[numpy.asarray(doc) for doc in chunk]] | 
 |  else: | 
 |  wrapped_chunk = [list(chunk)] | 
 |  | 
 |  if not wrapped_chunk[0]: | 
 |  self.q.put(None, block=True) | 
 |  break | 
 |  | 
 |  try: | 
 |  qsize = self.q.qsize() | 
 |  except NotImplementedError: | 
 |  qsize = '?' | 
 |  logger.debug("prepared another chunk of %i documents (qsize=%s)" % | 
 |  (len(wrapped_chunk[0]), qsize)) | 
 |  self.q.put(wrapped_chunk.pop(), block=True) | 
 | #endclass InputQueue | 
 |  | 
 |  | 
 | if os.name == 'nt': | 
 |  logger.info("detected Windows; aliasing chunkize to chunkize_serial") | 
 |  | 
 |  def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): | 
 |  for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy): | 
 |  yield chunk | 
 | else: | 
 |  def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): | 
 |  """ | 
 |  Split a stream of values into smaller chunks. | 
 |  Each chunk is of length `chunksize`, except the last one which may be smaller. | 
 |  A once-only input stream (`corpus` from a generator) is ok, chunking is done | 
 |  efficiently via itertools. | 
 |  If `maxsize > 1`, don't wait idly in between successive chunk `yields`, but | 
 |  rather keep filling a short queue (of size at most `maxsize`) with forthcoming | 
 |  chunks in advance. This is realized by starting a separate process, and is | 
 |  meant to reduce I/O delays, which can be significant when `corpus` comes | 
 |  from a slow medium (like harddisk). | 
        If `maxsize==0`, don't fool around with parallelism and simply yield chunks
        via `chunkize_serial()` (no I/O optimizations).
 |  >>> for chunk in chunkize(range(10), 4): print(chunk) | 
 |  [0, 1, 2, 3] | 
 |  [4, 5, 6, 7] | 
 |  [8, 9] | 
 |  """ | 
 |  assert chunksize > 0 | 
 |  | 
 |  if maxsize > 0: | 
 |  q = multiprocessing.Queue(maxsize=maxsize) | 
 |  worker = InputQueue(q, corpus, chunksize, maxsize=maxsize, as_numpy=as_numpy) | 
 |  worker.daemon = True | 
 |  worker.start() | 
 |  while True: | 
 |  chunk = [q.get(block=True)] | 
 |  if chunk[0] is None: | 
 |  break | 
 |  yield chunk.pop() | 
 |  else: | 
 |  for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy): | 
 |  yield chunk | 
 |  | 
 |  | 
 | def make_closing(base, **attrs): | 
 |  """ | 
 |  Add support for `with Base(attrs) as fout:` to the base class if it's missing. | 
 |  The base class' `close()` method will be called on context exit, to always close the file properly. | 
 |  This is needed for gzip.GzipFile, bz2.BZ2File etc in older Pythons (<=2.6), which otherwise | 
 |  raise "AttributeError: GzipFile instance has no attribute '__exit__'". | 
 |  """ | 
 |  if not hasattr(base, '__enter__'): | 
 |  attrs['__enter__'] = lambda self: self | 
 |  if not hasattr(base, '__exit__'): | 
 |  attrs['__exit__'] = lambda self, type, value, traceback: self.close() | 
 |  return type('Closing' + base.__name__, (base, object), attrs) | 
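

# Illustrative sketch (hypothetical temporary path): the wrapped class can be
# used in a `with` block even on old Pythons whose `bz2.BZ2File` lacks context
# manager support; on modern Pythons the added attrs stay empty and the subclass
# behaves exactly like the base class.
def _example_make_closing():
    import bz2
    import tempfile

    ClosingBZ2File = make_closing(bz2.BZ2File)
    fname = os.path.join(tempfile.gettempdir(), 'gensim_example.bz2')
    with ClosingBZ2File(fname, 'wb') as fout:
        fout.write(b"hello")
    with ClosingBZ2File(fname, 'rb') as fin:
        data = fin.read()
    os.remove(fname)
    return data  # b"hello"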
 |  | 
 |  | 
 | def smart_open(fname, mode='rb'): | 
 |  _, ext = os.path.splitext(fname) | 
 |  if ext == '.bz2': | 
 |  from bz2 import BZ2File | 
 |  return make_closing(BZ2File)(fname, mode) | 
 |  if ext == '.gz': | 
 |  from gzip import GzipFile | 
 |  return make_closing(GzipFile)(fname, mode) | 
 |  return open(fname, mode) | 
 |  | 
 |  | 
 | def pickle(obj, fname, protocol=-1): | 
 |  """Pickle object `obj` to file `fname`.""" | 
 |  with smart_open(fname, 'wb') as fout: # 'b' for binary, needed on Windows | 
 |  _pickle.dump(obj, fout, protocol=protocol) | 
 |  | 
 |  | 
 | def unpickle(fname): | 
 |  """Load pickled object from `fname`""" | 
 |  with smart_open(fname) as f: | 
 |  return _pickle.load(f) | 
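

# Illustrative sketch (hypothetical temporary path): `pickle`/`unpickle` go
# through `smart_open`, so the target is transparently (de)compressed based on
# its extension.
def _example_pickle_roundtrip():
    import tempfile

    fname = os.path.join(tempfile.gettempdir(), 'gensim_example.pkl.gz')
    pickle({'answer': 42}, fname)  # written as a gzip-compressed pickle
    obj = unpickle(fname)          # decompressed and unpickled transparently
    os.remove(fname)
    return obj  # {'answer': 42}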
 |  | 
 |  | 
 | def revdict(d): | 
 |  """ | 
 |  Reverse a dictionary mapping. | 
 |  When two keys map to the same value, only one of them will be kept in the | 
 |  result (which one is kept is arbitrary). | 
 |  """ | 
 |  return dict((v, k) for (k, v) in iteritems(d)) | 
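

# Illustrative sketch: turning an id->word mapping back into word->id.
def _example_revdict():
    id2word = {0: 'cat', 1: 'dog'}
    word2id = revdict(id2word)
    return word2id  # {'cat': 0, 'dog': 1}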
 |  | 
 |  | 
 | def toptexts(query, texts, index, n=10): | 
 |  """ | 
    Debug function to help inspect the top `n` most similar documents (according to a
 |  similarity index `index`), to see if they are actually related to the query. | 
 |  `texts` is any object that can return something insightful for each document | 
 |  via `texts[docid]`, such as its fulltext or snippet. | 
 |  Return a list of 3-tuples (docid, doc's similarity to the query, texts[docid]). | 
 |  """ | 
 |  sims = index[query] # perform a similarity query against the corpus | 
 |  sims = sorted(enumerate(sims), key=lambda item: -item[1]) | 
 |  | 
 |  result = [] | 
 |  for topid, topcosine in sims[:n]: # only consider top-n most similar docs | 
 |  result.append((topid, topcosine, texts[topid])) | 
 |  return result | 
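

# Illustrative sketch (hypothetical index): `index` only needs to support
# `index[query]` returning one similarity score per indexed document, as
# gensim's similarity indexes do; a tiny stand-in object is used here.
def _example_toptexts():
    class _FakeIndex(object):
        def __getitem__(self, query):
            return [0.2, 0.9, 0.5]  # pretend similarities of 3 indexed documents

    texts = ["first doc", "second doc", "third doc"]
    return toptexts(query=[(0, 1.0)], texts=texts, index=_FakeIndex(), n=2)
    # -> [(1, 0.9, 'second doc'), (2, 0.5, 'third doc')]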
 |  | 
 |  | 
 | def randfname(prefix='gensim'): | 
 |  randpart = hex(random.randint(0, 0xffffff))[2:] | 
 |  return os.path.join(tempfile.gettempdir(), prefix + randpart) | 
 |  | 
 |  | 
 | def upload_chunked(server, docs, chunksize=1000, preprocess=None): | 
 |  """ | 
 |  Memory-friendly upload of documents to a SimServer (or Pyro SimServer proxy). | 
 |  Use this function to train or index large collections -- avoid sending the | 
 |  entire corpus over the wire as a single Pyro in-memory object. The documents | 
 |  will be sent in smaller chunks, of `chunksize` documents each. | 
 |  """ | 
 |  start = 0 | 
 |  for chunk in grouper(docs, chunksize): | 
 |  end = start + len(chunk) | 
 |  logger.info("uploading documents %i-%i" % (start, end - 1)) | 
 |  if preprocess is not None: | 
 |  pchunk = [] | 
 |  for doc in chunk: | 
 |  doc['tokens'] = preprocess(doc['text']) | 
 |  del doc['text'] | 
 |  pchunk.append(doc) | 
 |  chunk = pchunk | 
 |  server.buffer(chunk) | 
 |  start = end | 
 |  | 
 |  | 
 | def getNS(): | 
 |  """ | 
 |  Return a Pyro name server proxy. If there is no name server running, | 
 |  start one on 0.0.0.0 (all interfaces), as a background process. | 
 |  """ | 
 |  import Pyro4 | 
 |  try: | 
 |  return Pyro4.locateNS() | 
 |  except Pyro4.errors.NamingError: | 
 |  logger.info("Pyro name server not found; starting a new one") | 
 |  os.system("python -m Pyro4.naming -n 0.0.0.0 &") | 
 |  # TODO: spawn a proper daemon ala http://code.activestate.com/recipes/278731/ ? | 
 |  # like this, if there's an error somewhere, we'll never know... (and the loop | 
 |  # below will block). And it probably doesn't work on windows, either. | 
 |  while True: | 
 |  try: | 
 |  return Pyro4.locateNS() | 
 |  except: | 
 |  pass | 
 |  | 
 |  | 
 | def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None): | 
 |  """ | 
 |  Register object with name server (starting the name server if not running | 
 |  yet) and block until the daemon is terminated. The object is registered under | 
 |  `name`, or `name`+ some random suffix if `random_suffix` is set. | 
 |  """ | 
 |  if random_suffix: | 
 |  name += '.' + hex(random.randint(0, 0xffffff))[2:] | 
 |  import Pyro4 | 
 |  with getNS() as ns: | 
 |  with Pyro4.Daemon(ip or get_my_ip(), port or 0) as daemon: | 
 |  # register server for remote access | 
 |  uri = daemon.register(obj, name) | 
 |  ns.remove(name) | 
 |  ns.register(name, uri) | 
 |  logger.info("%s registered with nameserver (URI '%s')" % (name, uri)) | 
 |  daemon.requestLoop() | 
 |  | 
 |  | 
 | if HAS_PATTERN: | 
 |  def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False, stopwords=frozenset()): | 
 |  """ | 
 |  This function is only available when the optional 'pattern' package is installed. | 
 |  Use the English lemmatizer from `pattern` to extract tokens in | 
    their base form (lemma), e.g. "are, is, being" -> "be" etc.
 |  This is a smarter version of stemming, taking word context into account. | 
 |  Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded). | 
 |  >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21') | 
 |  ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN'] | 
 |  >>> lemmatize('The study ranks high.') | 
 |  ['study/NN', 'rank/VB', 'high/JJ'] | 
 |  >>> lemmatize('The ranks study hard.') | 
 |  ['rank/NN', 'study/VB', 'hard/RB'] | 
 |  """ | 
 |  if light: | 
 |  import warnings | 
 |  warnings.warn("The light flag is no longer supported by pattern.") | 
 |  | 
 |  # tokenization in `pattern` is weird; it gets thrown off by non-letters, | 
 |  # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little | 
 |  # FIXME this throws away all fancy parsing cues, including sentence structure, | 
 |  # abbreviations etc. | 
 |  content = u(' ').join(tokenize(content, lower=True, errors='ignore')) | 
 |  | 
 |  parsed = parse(content, lemmata=True, collapse=False) | 
 |  result = [] | 
 |  for sentence in parsed: | 
 |  for token, tag, _, _, lemma in sentence: | 
 |  if 2 <= len(lemma) <= 15 and not lemma.startswith('_') and lemma not in stopwords: | 
 |  if allowed_tags.match(tag): | 
 |  lemma += "/" + tag[:2] | 
 |  result.append(lemma.encode('utf8')) | 
 |  return result |