from collections import OrderedDict
import pandas as pd
pd.set_option('display.expand_frame_repr', False)
import numpy as np
# ******************************************
# helpers
# ******************************************
def _set_values_series(dfs):
return set(dfs[~pd.isnull(dfs)])
def _set_values(dfg, key):
    """Return the set of non-null values in column ``key`` of dataframe ``dfg``."""
    column = dfg[key]
    return _set_values_series(column)
def _filter_group_min(dfg, col, topn=1):
"""
Returns all rows equal to min in col
"""
if topn==1:
return dfg[dfg[col] == dfg[col].min()]
else:
return dfg[dfg[col].isin(np.sort(dfg[col].unique())[:topn])]
from joblib import Parallel, delayed
import multiprocessing
def _applyFunMulticore(values1, values2, func):
    """Apply ``func(v1, v2)`` pairwise over two sequences, parallelized across all CPU cores."""
    n_jobs = multiprocessing.cpu_count()
    pairs = zip(values1, values2)
    return Parallel(n_jobs=n_jobs)(delayed(func)(v1, v2) for v1, v2 in pairs)
# ******************************************
# tfidf
# ******************************************
import re
import collections
from joblib import Parallel, delayed
import multiprocessing
import itertools
import warnings
def tokenCount(dfs, fun, mincount=2, minlength=1):
    """
    Tokenize a series of strings and count occurrence of string tokens

    Args:
        dfs (pd.series): pd.series of values
        fun (function): tokenize function, maps one string to a list of tokens
        mincount (int): discard tokens with count less than mincount
        minlength (int): discard tokens with string length less than minlength

    Returns:
        dataframe: count of tokens, columns ``['word', 'count']``, most frequent first
    """
    assert len(dfs.shape) == 1
    # deduplicate up front so each distinct string is tokenized only once
    dfs = dfs.dropna().unique()
    if dfs.shape[0] > 1000:
        # large input: tokenize in parallel across all cores
        words = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(fun)(s) for s in dfs)
    else:
        words = [fun(s) for s in dfs]
    words = list(itertools.chain.from_iterable(words))
    df_count = [t for t in collections.Counter(words).most_common()
                if t[1] >= mincount and len(t[0]) >= minlength]
    return pd.DataFrame(df_count, columns=['word', 'count'])
def splitcharTokenCount(dfs, splitchars="[^a-zA-Z0-9]+", mincount=2, minlength=1):  # "[ -_|]+"
    """
    Tokenize a series of strings by splitting strings on a set of characters. Then count occurrence of tokens in series.

    Args:
        dfs (pd.series): pd.series of values
        splitchars (str): regex by which to split string into tokens. For example ``"[^a-zA-Z0-9]+"`` for anything not alpha-numeric or ``"[ -_|]+"`` for common ID tokens.
        mincount (int): discard tokens with count less than mincount
        minlength (int): discard tokens with string length less than minlength

    Returns:
        dataframe: count of tokens, columns ``['word', 'count']``
    """
    def funsplit(s):
        return re.split(splitchars, s)
    return tokenCount(dfs, funsplit, mincount, minlength)
def ncharTokenCount(dfs, nchars=None, overlapping=False, mincount=2, minlength=1):
    """
    Tokenize a series of strings by splitting strings into tokens of ``nchars`` length. Then count occurrence of tokens in series.

    Args:
        dfs (pd.series): pd.series of values
        nchars (int): number of characters in each token. If not given, defaults
            to a quarter of the longest string length (at least 1).
        overlapping (bool): make overlapping tokens
        mincount (int): discard tokens with count less than mincount
        minlength (int): discard tokens with string length less than minlength

    Returns:
        dataframe: count of tokens, columns ``['word', 'count']``
    """
    if not nchars:
        # compute string lengths once instead of three times
        lengths = dfs.str.len()
        smax = lengths.max()
        smin = lengths.min()
        if smax - smin > 2:
            warnings.warn('Tokenize works best if strings have similar length')
        # clamp to >= 1: for strings shorter than 4 chars smax//4 is 0, which
        # would make range() below raise "step argument must not be zero"
        nchars = max(smax // 4, 1)
    if overlapping:
        def funtokenize(s):
            return [s[i:i + nchars] for i in range(0, len(s) - nchars + 1)]
    else:
        def funtokenize(s):
            return [s[i:i + nchars] for i in range(0, len(s), nchars)]
    return tokenCount(dfs, funtokenize, mincount, minlength)
def unique_contains(dfs, strlist):
    """
    Find values which contain a set of substrings

    Args:
        dfs (pd.series): pd.series of values
        strlist (list): substrings to find

    Returns:
        list: one ``(substring, matches)`` tuple per entry in ``strlist``,
        where ``matches`` is the sorted unique values containing that substring
    """
    assert len(dfs.shape) == 1
    # np.unique both deduplicates and sorts the candidate values
    dfs = np.unique(dfs)
    outlist = [(x, [s for s in dfs if x in s]) for x in strlist]
    return outlist
import collections
def typeSeries(dfs):
    """
    Find type of a pandas series

    Args:
        dfs (pd.series): pd.series of values

    Returns:
        type or str: the common element type when all elements share one type,
        otherwise the string ``'mixed'``
    """
    c = collections.Counter(type(x) for x in dfs.values)
    cnt = c.most_common()
    if len(cnt) > 1:
        return 'mixed'
    # exactly one type present; an empty series raises IndexError (unchanged behavior)
    return cnt[0][0]
def typeDataFrame(df):
    """
    Find type of a pandas dataframe columns

    Args:
        df (pd.dataframe): pandas dataframe

    Returns:
        dict: column name -> type of that column as returned by ``typeSeries``
    """
    # iterating a dataframe yields its column names
    return {col: typeSeries(df[col]) for col in df}