diff --git a/ge/models/__init__.py b/ge/models/__init__.py index d2375e9..cd49858 100644 --- a/ge/models/__init__.py +++ b/ge/models/__init__.py @@ -3,6 +3,7 @@ from .line import LINE from .sdne import SDNE from .struc2vec import Struc2Vec +from .bfswalk import BFSWalk -__all__ = ["DeepWalk", "Node2Vec", "LINE", "SDNE", "Struc2Vec"] +__all__ = ["DeepWalk", "Node2Vec", "LINE", "SDNE", "Struc2Vec", "BFSWalk"] diff --git a/ge/models/alibaba-eges.py b/ge/models/alibaba-eges.py new file mode 100644 index 0000000..b4ec89c --- /dev/null +++ b/ge/models/alibaba-eges.py @@ -0,0 +1,67 @@ +# -*- coding:utf-8 -*- + +""" + + + +Author: + + Chengliang Zhao, bruce.e.zhao@gmail.com + + + +Reference: + + [1] Jizhe Wang, Pipei Huang, Huan Zhao, Zhibo Zhang, Binqiang Zhao, and Dik Lun Lee. 2018. Billion-scale Commodity Embedding for E-commerce Recommendation in Alibaba. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD '18). Association for Computing Machinery, New York, NY, USA, 839–848. 
DOI:https://doi.org/10.1145/3219819.3219869 + + +""" +from ..walker import RandomWalker +from gensim.models import Word2Vec +import pandas as pd +import numpy as np + + +class EGES: + def __init__(self, graph, walk_length, num_walks, workers=1): + + self.graph = graph + self.w2v_model = None + self._embeddings = {} + + self.walker = RandomWalker( + graph, p=1, q=1, ) + self.sentences = self.walker.simulate_walks("deep", [], + num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1) + + def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs): + + kwargs["sentences"] = self.sentences + kwargs["min_count"] = kwargs.get("min_count", 0) + kwargs["vector_size"] = embed_size + kwargs["sg"] = 1 # skip gram + kwargs["hs"] = 1 # deepwalk use Hierarchical Softmax + kwargs["workers"] = workers + kwargs["window"] = window_size + kwargs["epochs"] = iter + + print("Learning embedding vectors...") + model = Word2Vec(**kwargs) + print("Learning embedding vectors done!") + + self.w2v_model = model + return model + + + + + def get_embeddings(self,): + if self.w2v_model is None: + print("model not train") + return {} + + self._embeddings = {} + for word in self.graph.nodes(): + self._embeddings[word] = self.w2v_model.wv[word] + + return self._embeddings diff --git a/ge/models/bfswalk.py b/ge/models/bfswalk.py new file mode 100644 index 0000000..72155a6 --- /dev/null +++ b/ge/models/bfswalk.py @@ -0,0 +1,67 @@ +# -*- coding:utf-8 -*- + +""" + + + +Author: + + Weichen Shen,wcshen1994@163.com + + + +Reference: + + [1] Perozzi B, Al-Rfou R, Skiena S. Deepwalk: Online learning of social representations[C]//Proceedings of the 20th ACM SIGKDD international conference on Knowledge discovery and data mining. 
ACM, 2014: 701-710.(http://www.perozzi.net/publications/14_kdd_deepwalk.pdf) + + + +""" +from ..walker import RandomWalker +from gensim.models import Word2Vec, word2vec +import pandas as pd + + +class BFSWalk: + def __init__(self, graph, outlier, walk_length, num_walks, workers=1, weight = False): + + self.graph = graph + self.w2v_model = None + self._embeddings = {} + + self.walker = RandomWalker( + graph, p=1, q=1, ) + self.sentences = self.walker.simulate_walks("bfs", outlier, + num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1, weight = weight) + + def train(self, walkfile, embed_size=128, window_size=5, workers=3, iter=5, sg = 1, hs=1, **kwargs): + + kwargs["sentences"] = word2vec.Text8Corpus(walkfile) + kwargs["min_count"] = kwargs.get("min_count", 1) + kwargs["vector_size"] = embed_size + kwargs["sg"] = sg # skip gram + kwargs["hs"] = hs # deepwalk use Hierarchical Softmax + kwargs["workers"] = workers + kwargs["window"] = window_size + kwargs["epochs"] = iter + + print("Learning embedding vectors...") + model = Word2Vec(**kwargs) + print("Learning embedding vectors done!") + + self.w2v_model = model + return model + + def get_embeddings(self,): + if self.w2v_model is None: + print("model not train") + return {} + + self._embeddings = {} + for word in self.graph.nodes(): + self._embeddings[word] = self.w2v_model.wv[word] + + return self._embeddings + + def get_sentences(self): + return self.sentences \ No newline at end of file diff --git a/ge/models/deepwalk.py b/ge/models/deepwalk.py index d0fadc7..0d3386a 100644 --- a/ge/models/deepwalk.py +++ b/ge/models/deepwalk.py @@ -18,12 +18,12 @@ """ from ..walker import RandomWalker -from gensim.models import Word2Vec +from gensim.models import Word2Vec, word2vec import pandas as pd class DeepWalk: - def __init__(self, graph, walk_length, num_walks, workers=1): + def __init__(self, graph, outlier, walk_length, num_walks, workers=1, weight = False): self.graph = graph self.w2v_model = None @@ 
-31,19 +31,19 @@ def __init__(self, graph, walk_length, num_walks, workers=1): self.walker = RandomWalker( graph, p=1, q=1, ) - self.sentences = self.walker.simulate_walks( - num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1) + self.sentences = self.walker.simulate_walks("deep", outlier, + num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1, weight = weight) - def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs): + def train(self, walkfile, embed_size=128, window_size=5, workers=3, iter=5, sg = 1, hs=1, **kwargs): - kwargs["sentences"] = self.sentences - kwargs["min_count"] = kwargs.get("min_count", 0) - kwargs["size"] = embed_size - kwargs["sg"] = 1 # skip gram - kwargs["hs"] = 1 # deepwalk use Hierarchical Softmax + kwargs["sentences"] = word2vec.Text8Corpus(walkfile) + kwargs["min_count"] = kwargs.get("min_count", 1) + kwargs["vector_size"] = embed_size + kwargs["sg"] = sg # skip gram + kwargs["hs"] = hs # deepwalk use Hierarchical Softmax kwargs["workers"] = workers kwargs["window"] = window_size - kwargs["iter"] = iter + kwargs["epochs"] = iter print("Learning embedding vectors...") model = Word2Vec(**kwargs) @@ -62,3 +62,6 @@ def get_embeddings(self,): self._embeddings[word] = self.w2v_model.wv[word] return self._embeddings + + def get_sentences(self): + return self.sentences \ No newline at end of file diff --git a/ge/models/line.py b/ge/models/line.py index 04c5073..96de8b0 100644 --- a/ge/models/line.py +++ b/ge/models/line.py @@ -92,7 +92,7 @@ def __init__(self, graph, embedding_size=8, negative_ratio=5, order='second',): self.node_size = graph.number_of_nodes() self.edge_size = graph.number_of_edges() self.samples_per_epoch = self.edge_size*(1+negative_ratio) - + self._gen_sampling_table() self.reset_model() diff --git a/ge/models/node2vec.py b/ge/models/node2vec.py index 16f86cb..1e0c6a8 100644 --- a/ge/models/node2vec.py +++ b/ge/models/node2vec.py @@ -18,7 +18,7 @@ """ -from 
gensim.models import Word2Vec +from gensim.models import Word2Vec, word2vec import pandas as pd from ..walker import RandomWalker @@ -26,7 +26,7 @@ class Node2Vec: - def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, workers=1, use_rejection_sampling=0): + def __init__(self, graph, outlier, walk_length, num_walks, p=1.0, q=1.0, workers=1, use_rejection_sampling=0): self.graph = graph self._embeddings = {} @@ -36,19 +36,19 @@ def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, workers=1, use_r print("Preprocess transition probs...") self.walker.preprocess_transition_probs() - self.sentences = self.walker.simulate_walks( - num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1) + self.sentences = self.walker.simulate_walks("node", outlier, + num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1, weight=False) - def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs): + def train(self, walkfile, embed_size=128, window_size=5, workers=3, iter=5, sg = 1, hs = 1, **kwargs): - kwargs["sentences"] = self.sentences - kwargs["min_count"] = kwargs.get("min_count", 0) - kwargs["size"] = embed_size - kwargs["sg"] = 1 - kwargs["hs"] = 0 # node2vec not use Hierarchical Softmax + kwargs["sentences"] = word2vec.Text8Corpus(walkfile) + kwargs["min_count"] = kwargs.get("min_count", 1) + kwargs["vector_size"] = embed_size + kwargs["sg"] = sg + kwargs["hs"] = hs # node2vec not use Hierarchical Softmax kwargs["workers"] = workers kwargs["window"] = window_size - kwargs["iter"] = iter + kwargs["epochs"] = iter print("Learning embedding vectors...") model = Word2Vec(**kwargs) @@ -68,3 +68,6 @@ def get_embeddings(self,): self._embeddings[word] = self.w2v_model.wv[word] return self._embeddings + + def get_sentences(self): + return self.sentences \ No newline at end of file diff --git a/ge/models/struc2vec.py b/ge/models/struc2vec.py index 4040562..e637928 100644 --- a/ge/models/struc2vec.py +++ 
b/ge/models/struc2vec.py @@ -112,8 +112,8 @@ def train(self, embed_size=128, window_size=5, workers=3, iter=5): sentences = self.sentences print("Learning representation...") - model = Word2Vec(sentences, size=embed_size, window=window_size, min_count=0, hs=1, sg=1, workers=workers, - iter=iter) + model = Word2Vec(sentences, vector_size=embed_size, window=window_size, min_count=0, hs=1, sg=1, workers=workers, + epochs=iter) print("Learning representation done!") self.w2v_model = model diff --git a/ge/walker.py b/ge/walker.py index 7266585..e3408c5 100644 --- a/ge/walker.py +++ b/ge/walker.py @@ -35,7 +35,47 @@ def deepwalk_walk(self, walk_length, start_node): walk.append(random.choice(cur_nbrs)) else: break - return walk + + return " ".join(walk) + + def bfs_walk(self, walk_length, start_node): + + walk = [start_node] + + # while len(walk) < walk_length: + cur = walk[-1] + cur_nbrs = list(self.G.neighbors(cur)) + if len(cur_nbrs) > 0: + l = len(cur_nbrs) + ranlist = random.sample(range(0,l),l) + for i in ranlist: + walk.append(cur_nbrs[i]) + + return " ".join(walk) + + def deepwalk_walk_weighted(self, walk_length, start_node): + + walk = [start_node] + + while len(walk) < walk_length: + cur = walk[-1] + cur_nbrs = list(self.G.neighbors(cur)) + if len(cur_nbrs) > 0: + p = self.chose_node_p(cur, cur_nbrs) + walk.append(random.choices(cur_nbrs, weights=p)[0]) + else: + break + return " ".join(walk) + + # Compute each neighbor's selection probability from edge weights + def chose_node_p(self, cur, nbrs): + # Return the normalized probability of choosing each neighbor + weight = [] + for i in nbrs: + weight.append(self.G[cur][i]["weight"]) + + total = sum(weight) + return [i/total for i in weight] def node2vec_walk(self, walk_length, start_node): @@ -61,7 +101,7 @@ def node2vec_walk(self, walk_length, start_node): else: break - return walk + return " ".join(walk) def node2vec_walk2(self, walk_length, start_node): """ @@ -114,36 +154,50 @@ def rejection_sample(inv_p, inv_q, nbrs_num): walk.append(next_node) else: break - return walk - - def 
simulate_walks(self, num_walks, walk_length, workers=1, verbose=0): + return " ".join(walk) + def simulate_walks(self, method, outlier, num_walks, walk_length, workers=1, verbose=0, weight = False): G = self.G nodes = list(G.nodes()) - - results = Parallel(n_jobs=workers, verbose=verbose, )( - delayed(self._simulate_walks)(nodes, num, walk_length) for num in - partition_num(num_walks, workers)) - - walks = list(itertools.chain(*results)) + print(len(nodes)) + nodes = [i for i in nodes if G.out_degree(i)] + nodes.extend(outlier) + random.shuffle(nodes) + print(len(nodes)) + # results = Parallel(n_jobs=workers, verbose=verbose, )( + # delayed(self._simulate_walks)(method, nodes, num, walk_length, weight) for num in + # partition_num(num_walks, workers)) + + # walks = list(itertools.chain(*results)) + walks = [] + self._simulate_walks(walks, method, nodes, num_walks, walk_length, weight) return walks - def _simulate_walks(self, nodes, num_walks, walk_length,): - walks = [] - for _ in range(num_walks): - random.shuffle(nodes) - for v in nodes: - if self.p == 1 and self.q == 1: - walks.append(self.deepwalk_walk( - walk_length=walk_length, start_node=v)) - elif self.use_rejection_sampling: - walks.append(self.node2vec_walk2( - walk_length=walk_length, start_node=v)) + def _simulate_walks(self, walks, method, nodes, num_walks, walk_length, weight = False): + # walks = [] + for v in nodes: + for _ in range(num_walks): + # random.shuffle(nodes) + if method == "deep": + if weight: + walks.append(self.deepwalk_walk_weighted( + walk_length=walk_length, start_node=v)) + else: + walks.append(self.deepwalk_walk( + walk_length=walk_length, start_node=v)) + elif method == "bfs": + walks.append(self.bfs_walk(walk_length=walk_length, start_node=v)) + elif method == "node": + if self.use_rejection_sampling: + walks.append(self.node2vec_walk2( + walk_length=walk_length, start_node=v)) + else: + walks.append(self.node2vec_walk( + walk_length=walk_length, start_node=v)) else: - 
walks.append(self.node2vec_walk( - walk_length=walk_length, start_node=v)) + pass return walks def get_alias_edge(self, t, v): diff --git a/setup.py b/setup.py index 38a4235..1843939 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ import setuptools -with open("README.md", "r") as fh: +with open("README.md", "r", encoding='utf-8') as fh: long_description = fh.read()