Source code for d6tjoin.utils

from collections import OrderedDict

import pandas as pd
pd.set_option('display.expand_frame_repr', False)  # show wide dataframes on a single line
import numpy as np

# ******************************************
# helpers
# ******************************************
def _set_values_series(dfs):
    """Return the set of non-null values in a series"""
    return set(dfs[~pd.isnull(dfs)])

def _set_values(dfg, key):
    """Return the set of non-null values in dataframe column `key`"""
    return _set_values_series(dfg[key])

def _filter_group_min(dfg, col, topn=1):
    """
    Return all rows of dfg where col equals its minimum value. With topn > 1,
    return all rows whose value in col is among the topn smallest unique values.
    """
    if topn == 1:
        return dfg[dfg[col] == dfg[col].min()]
    else:
        return dfg[dfg[col].isin(np.sort(dfg[col].unique())[:topn])]
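
# Usage sketch for _filter_group_min (hypothetical data, not part of the
# original module), e.g. keeping the best candidates after scoring
# fuzzy-match distances:
#   >>> df = pd.DataFrame({'key': ['a', 'b', 'c'], 'dist': [1, 1, 2]})
#   >>> _filter_group_min(df, 'dist')['key'].tolist()
#   ['a', 'b']
#   >>> _filter_group_min(df, 'dist', topn=2)['key'].tolist()
#   ['a', 'b', 'c']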

from joblib import Parallel, delayed
import multiprocessing
def _applyFunMulticore(values1, values2, func):
    """Apply func(v1, v2) pairwise over zip(values1, values2), parallelized across all CPU cores"""
    retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(p[0], p[1]) for p in zip(values1, values2))
    return retLst
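
# Usage sketch for _applyFunMulticore (hypothetical inputs, not part of the
# original module); the pairwise function shown is illustrative only, any
# two-argument function such as a string-distance metric works:
#   >>> _applyFunMulticore(['abc', 'abd'], ['abc', 'abe'],
#   ...                    lambda a, b: sum(x != y for x, y in zip(a, b)))
#   [0, 1]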


# ******************************************
# tfidf
# ******************************************
import re
import collections
import itertools
import warnings

def tokenCount(dfs, fun, mincount=2, minlength=1):
    """
    Tokenize a series of strings and count occurrences of string tokens

    Args:
        dfs (pd.Series): pd.Series of values
        fun (function): tokenize function
        mincount (int): discard tokens with count less than mincount
        minlength (int): discard tokens with string length less than minlength

    Returns:
        dataframe: count of tokens
    """
    assert len(dfs.shape) == 1
    dfs = dfs.dropna().unique()
    # parallelize tokenization for larger inputs
    if dfs.shape[0] > 1000:
        words = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(fun)(s) for s in dfs)
    else:
        words = [fun(s) for s in dfs]
    words = list(itertools.chain.from_iterable(words))
    df_count = [t for t in collections.Counter(words).most_common() if t[1] >= mincount and len(t[0]) >= minlength]
    df_count = pd.DataFrame(df_count, columns=['word', 'count'])
    return df_count
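
# Usage sketch for tokenCount (hypothetical data, not part of the original
# module), tokenizing on whitespace:
#   >>> dfs = pd.Series(['mr john smith', 'mrs jane smith', 'dr john doe'])
#   >>> tokenCount(dfs, lambda s: s.split(), mincount=2)
#   # -> tokens 'john' and 'smith', each with count 2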
def splitcharTokenCount(dfs, splitchars="[^a-zA-Z0-9]+", mincount=2, minlength=1):
    """
    Tokenize a series of strings by splitting strings on a set of characters,
    then count occurrences of the tokens in the series.

    Args:
        dfs (pd.Series): pd.Series of values
        splitchars (str): regex by which to split strings into tokens,
            for example `"[^a-zA-Z0-9]+"` for anything non-alphanumeric
            or `"[ -_|]+"` for common ID separators
        mincount (int): discard tokens with count less than mincount
        minlength (int): discard tokens with string length less than minlength

    Returns:
        dataframe: count of tokens
    """
    def funsplit(s):
        return re.split(splitchars, s)
    return tokenCount(dfs, funsplit, mincount, minlength)
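
# Usage sketch for splitcharTokenCount (hypothetical ID strings, not part of
# the original module), using the default non-alphanumeric split:
#   >>> dfs = pd.Series(['BEZ-001-A', 'BEZ-002-B', 'XYZ-003-A'])
#   >>> splitcharTokenCount(dfs, mincount=2)
#   # -> tokens 'BEZ' and 'A', each with count 2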
def ncharTokenCount(dfs, nchars=None, overlapping=False, mincount=2, minlength=1):
    """
    Tokenize a series of strings by splitting strings into tokens of `nchars`
    length, then count occurrences of the tokens in the series.

    Args:
        dfs (pd.Series): pd.Series of values
        nchars (int): number of characters in each token; defaults to a quarter
            of the longest string in the series
        overlapping (bool): make overlapping tokens
        mincount (int): discard tokens with count less than mincount
        minlength (int): discard tokens with string length less than minlength

    Returns:
        dataframe: count of tokens
    """
    if not nchars:
        smax = dfs.str.len().max()
        smin = dfs.str.len().min()
        if smax - smin > 2:
            warnings.warn('Tokenize works best if strings have similar length')
        nchars = smax // 4
    if overlapping:
        def funtokenize(s):
            return [s[i:i + nchars] for i in range(0, len(s) - nchars + 1)]
    else:
        def funtokenize(s):
            return [s[i:i + nchars] for i in range(0, len(s), nchars)]
    return tokenCount(dfs, funtokenize, mincount, minlength)
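
# Usage sketch for ncharTokenCount (hypothetical data, not part of the
# original module):
#   >>> dfs = pd.Series(['abcd1234', 'abcd5678'])
#   >>> ncharTokenCount(dfs, nchars=4, mincount=2)
#   # -> token 'abcd' with count 2; with nchars=None the token length would
#   #    default to max string length // 4, here 8 // 4 = 2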
def unique_contains(dfs, strlist):
    """
    Find values which contain a set of substrings

    Args:
        dfs (pd.Series): pd.Series of values
        strlist (list): substrings to find

    Returns:
        list: (substring, list of unique values containing that substring) tuples
    """
    assert len(dfs.shape) == 1
    dfs = np.unique(dfs)
    outlist = [(x, [s for s in dfs if x in s]) for x in strlist]
    return outlist
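
# Usage sketch for unique_contains (hypothetical data, not part of the
# original module):
#   >>> unique_contains(pd.Series(['apple', 'grape', 'pear']), ['ap', 'pe'])
#   [('ap', ['apple', 'grape']), ('pe', ['grape', 'pear'])]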
def typeSeries(dfs):
    """
    Find the type of values in a pandas series

    Args:
        dfs (pd.Series): pd.Series of values

    Returns:
        type: the common value type, or the string 'mixed' if the values
        have more than one type
    """
    c = collections.Counter([type(x) for x in dfs.values])
    cnt = c.most_common()
    if len(cnt) > 1:
        return 'mixed'
    else:
        return cnt[0][0]
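
# Usage sketch for typeSeries (hypothetical data, not part of the original
# module):
#   >>> typeSeries(pd.Series(['a', 'b']))
#   <class 'str'>
#   >>> typeSeries(pd.Series(['a', 1]))
#   'mixed'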
def typeDataFrame(df):
    """
    Find the type of values in each column of a pandas dataframe

    Args:
        df (pd.DataFrame): pandas dataframe

    Returns:
        dict: column name mapped to value type
    """
    return dict(zip(df.columns, [typeSeries(df[s]) for s in df]))
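
# Usage sketch for typeDataFrame (hypothetical data, not part of the original
# module); note numeric columns report numpy scalar types:
#   >>> typeDataFrame(pd.DataFrame({'id': ['a', 'b'], 'v': [1.0, 2.0]}))
#   {'id': <class 'str'>, 'v': <class 'numpy.float64'>}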