import itertools from StreamhashProjection import StreamhashProjection import numpy as np import tqdm
# Set tqdm's class-level monitor interval to 0 for every progress bar created
# below. NOTE(review): per the tqdm docs a value of 0 disables its background
# monitor thread -- confirm against the installed tqdm version.
tqdm.tqdm.monitor_interval = 0
class Chain:
    """One chain of per-depth bin-count tables over the columns of a matrix.

    At every depth one feature index (drawn at random in ``__init__``) is
    re-binned. The first time a feature is picked its column is shifted and
    divided by ``deltamax[f]``; every later pick doubles the scaled
    coordinate, i.e. halves the effective bin width. The floored vector of
    all scaled coordinates is the bin key whose occurrences are counted in a
    per-depth dictionary.
    """

    def __init__(self, deltamax, depth=25):
        """Initialise a single chain.

        Args:
            deltamax: 1-D array of per-feature bin widths; its length defines
                the number of features.
            depth (int, optional): Number of levels in the chain.
                Defaults to 25.
        """
        k = len(deltamax)
        self.deltamax = deltamax
        self.depth = depth
        # Feature index re-binned at each depth (drawn from the global
        # NumPy random state).
        self.fs = [np.random.randint(0, k) for _ in range(depth)]
        # One {bin key tuple: count} dict per depth; None until fitted.
        self.cmsketches = [None] * depth
        # Random per-feature offset in [0, deltamax).
        self.shift = np.random.rand(k) * deltamax

    def _iter_bin_keys(self, X):
        """Yield ``(depth, keys)`` with one integer bin-key tuple per row of X.

        Shared by ``fit`` and ``bincount`` so both bin points identically.
        """
        # Builtin float/int replace the np.float/np.int aliases, which were
        # removed from NumPy.
        prebins = np.zeros(X.shape, dtype=float)
        depthcount = np.zeros(len(self.deltamax), dtype=int)
        for depth in range(self.depth):
            f = self.fs[depth]
            depthcount[f] += 1
            if depthcount[f] == 1:
                prebins[:, f] = (X[:, f] + self.shift[f]) / self.deltamax[f]
            else:
                prebins[:, f] = (
                    2.0 * prebins[:, f] - self.shift[f] / self.deltamax[f]
                )
            # Floor the whole matrix once per depth instead of once per row.
            bins = np.floor(prebins).astype(int)
            yield depth, [tuple(row) for row in bins.tolist()]

    def fit(self, X, verbose=False, update=False):
        """Count, for every depth, how many rows of X fall into each bin.

        Args:
            X: 2-D array with one column per entry of ``deltamax``.
            verbose (bool, optional): Unused. Defaults to False.
            update (bool, optional): If True, add the counts to the existing
                per-depth tables instead of replacing them; a depth that has
                no table yet starts from an empty one. Defaults to False.

        Returns:
            Chain: This instance.
        """
        for depth, keys in self._iter_bin_keys(X):
            cmsketch = self.cmsketches[depth] if update else None
            if cmsketch is None:
                cmsketch = {}
            for key in keys:
                cmsketch[key] = cmsketch.get(key, 0) + 1
            self.cmsketches[depth] = cmsketch
        return self

    def bincount(self, X):
        """Look up, per depth, the fitted count of the bin each row falls in.

        Args:
            X: 2-D array with one column per entry of ``deltamax``.

        Returns:
            numpy.ndarray: Array of shape ``(X.shape[0], depth)``; a bin that
            is absent from the fitted table contributes 0.
        """
        scores = np.zeros((X.shape[0], self.depth))
        for depth, keys in self._iter_bin_keys(X):
            cmsketch = self.cmsketches[depth]
            for i, key in enumerate(keys):
                scores[i, depth] = cmsketch[key] if key in cmsketch else 0.0
        return scores

    def score(self, X, adjusted=False):
        """Return, per row, the minimum over depths of ``log2(1 + count) + depth``.

        Args:
            X: 2-D array with one column per entry of ``deltamax``.
            adjusted (bool, optional): Unused. Defaults to False.

        Returns:
            numpy.ndarray: 1-D array with one score per row of X.
        """
        counts = self.bincount(X)
        depths = np.arange(1, self.depth + 1)
        return np.min(np.log2(1.0 + counts) + depths, axis=1)
class Chains:
    """An ensemble of ``Chain`` objects fitted on a projected data set."""

    def __init__(self, k=50, nchains=100, depth=25, seed=42):
        """Initialise the ensemble.

        Args:
            k (int, optional): Number of projection components (the feature
                count seen by every chain). Defaults to 50.
            nchains (int, optional): Number of chains. Defaults to 100.
            depth (int, optional): Depth of every chain (the "level" in the
                paper, per the original comment). Defaults to 25.
            seed (int, optional): Passed as ``random_state`` to the
                projector. Defaults to 42.
        """
        self.nchains = nchains
        self.depth = depth
        self.chains = []
        self.projector = StreamhashProjection(
            n_components=k, density=1 / 3.0, random_state=seed
        )

    def fit(self, X):
        """Project X and fit ``nchains`` fresh chains on the projection.

        Chains from an earlier ``fit`` call are discarded, so ``score``
        always uses chains that match the current projection.

        Args:
            X: Data set; must be accepted by the projector's
                ``fit_transform``.

        Returns:
            Chains: This instance.
        """
        projected_X = self.projector.fit_transform(X)
        # Half of each projected feature's range is the initial bin width;
        # constant features get width 1.0 to avoid dividing by zero.
        deltamax = np.ptp(projected_X, axis=0) / 2.0
        deltamax[deltamax == 0] = 1.0
        # Build into a new list: appending to self.chains would leave stale
        # chains from a previous fit at the indices that score() reads.
        chains = []
        for _ in tqdm.tqdm(range(self.nchains), desc="Fitting..."):
            c = Chain(deltamax, depth=self.depth)
            c.fit(projected_X)
            chains.append(c)
        self.chains = chains
        return self

    def score(self, X, adjusted=False):
        """Return the mean chain score for every row of X.

        Args:
            X: Data set; must be accepted by the projector's ``transform``.
            adjusted (bool, optional): Forwarded to ``Chain.score``; the
                original comment notes it is not used. Defaults to False.

        Returns:
            numpy.ndarray: 1-D array with one averaged score per row of X.
        """
        projected_X = self.projector.transform(X)
        scores = np.zeros(X.shape[0])
        for i in tqdm.tqdm(range(self.nchains), desc="Scoring..."):
            scores += self.chains[i].score(projected_X, adjusted)
        scores /= float(self.nchains)
        return scores
|