diff --git a/ge/models/__init__.py b/ge/models/__init__.py
index d2375e9..cd49858 100644
--- a/ge/models/__init__.py
+++ b/ge/models/__init__.py
@@ -3,6 +3,7 @@
from .line import LINE
from .sdne import SDNE
from .struc2vec import Struc2Vec
+from .bfswalk import BFSWalk
-__all__ = ["DeepWalk", "Node2Vec", "LINE", "SDNE", "Struc2Vec"]
+__all__ = ["DeepWalk", "Node2Vec", "LINE", "SDNE", "Struc2Vec", "BFSWalk"]
diff --git a/ge/models/alibaba_eges.py b/ge/models/alibaba_eges.py
new file mode 100644
index 0000000..b4ec89c
--- /dev/null
+++ b/ge/models/alibaba_eges.py
@@ -0,0 +1,67 @@
+# -*- coding:utf-8 -*-
+
+"""
+
+
+
+Author:
+
+ Chengliang Zhao, bruce.e.zhao@gmail.com
+
+
+
+Reference:
+
+ [1] Jizhe Wang, Pipei Huang, Huan Zhao, Zhibo Zhang, Binqiang Zhao, and Dik Lun Lee. 2018. Billion-scale Commodity Embedding for E-commerce Recommendation in Alibaba. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD '18). Association for Computing Machinery, New York, NY, USA, 839–848. DOI:https://doi.org/10.1145/3219819.3219869
+
+
+"""
+from ..walker import RandomWalker
+from gensim.models import Word2Vec
+import pandas as pd
+import numpy as np
+
+
+class EGES:
+ def __init__(self, graph, walk_length, num_walks, workers=1):
+
+ self.graph = graph
+ self.w2v_model = None
+ self._embeddings = {}
+
+ self.walker = RandomWalker(
+ graph, p=1, q=1, )
+ self.sentences = self.walker.simulate_walks(
+ num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1)
+
+ def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):
+
+ kwargs["sentences"] = self.sentences
+ kwargs["min_count"] = kwargs.get("min_count", 0)
+ kwargs["vector_size"] = embed_size
+ kwargs["sg"] = 1 # skip gram
+ kwargs["hs"] = 1 # deepwalk use Hierarchical Softmax
+ kwargs["workers"] = workers
+ kwargs["window"] = window_size
+ kwargs["epochs"] = iter
+
+        print("Learning embedding vectors...")
+        model = Word2Vec(**kwargs)
+        print("Learning embedding vectors done!")
+
+        self.w2v_model = model
+        return model
+
+
+
+
+ def get_embeddings(self,):
+ if self.w2v_model is None:
+ print("model not train")
+ return {}
+
+ self._embeddings = {}
+ for word in self.graph.nodes():
+ self._embeddings[word] = self.w2v_model.wv[word]
+
+ return self._embeddings
diff --git a/ge/models/bfswalk.py b/ge/models/bfswalk.py
new file mode 100644
index 0000000..72155a6
--- /dev/null
+++ b/ge/models/bfswalk.py
@@ -0,0 +1,67 @@
+# -*- coding:utf-8 -*-
+
+"""
+
+
+
+Author:
+
+ Weichen Shen,wcshen1994@163.com
+
+
+
+Reference:
+
+ [1] Perozzi B, Al-Rfou R, Skiena S. Deepwalk: Online learning of social representations[C]//Proceedings of the 20th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2014: 701-710.(http://www.perozzi.net/publications/14_kdd_deepwalk.pdf)
+
+
+
+"""
+from ..walker import RandomWalker
+from gensim.models import Word2Vec, word2vec
+import pandas as pd
+
+
+class BFSWalk:
+ def __init__(self, graph, outlier, walk_length, num_walks, workers=1, weight = False):
+
+ self.graph = graph
+ self.w2v_model = None
+ self._embeddings = {}
+
+ self.walker = RandomWalker(
+ graph, p=1, q=1, )
+ self.sentences = self.walker.simulate_walks("bfs", outlier,
+ num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1, weight = weight)
+
+ def train(self, walkfile, embed_size=128, window_size=5, workers=3, iter=5, sg = 1, hs=1, **kwargs):
+
+ kwargs["sentences"] = word2vec.Text8Corpus(walkfile)
+ kwargs["min_count"] = kwargs.get("min_count", 1)
+ kwargs["vector_size"] = embed_size
+ kwargs["sg"] = sg # skip gram
+ kwargs["hs"] = hs # deepwalk use Hierarchical Softmax
+ kwargs["workers"] = workers
+ kwargs["window"] = window_size
+ kwargs["epochs"] = iter
+
+ print("Learning embedding vectors...")
+ model = Word2Vec(**kwargs)
+ print("Learning embedding vectors done!")
+
+ self.w2v_model = model
+ return model
+
+ def get_embeddings(self,):
+ if self.w2v_model is None:
+ print("model not train")
+ return {}
+
+ self._embeddings = {}
+ for word in self.graph.nodes():
+ self._embeddings[word] = self.w2v_model.wv[word]
+
+ return self._embeddings
+
+ def get_sentences(self):
+ return self.sentences
\ No newline at end of file
diff --git a/ge/models/deepwalk.py b/ge/models/deepwalk.py
index d0fadc7..0d3386a 100644
--- a/ge/models/deepwalk.py
+++ b/ge/models/deepwalk.py
@@ -18,12 +18,12 @@
"""
from ..walker import RandomWalker
-from gensim.models import Word2Vec
+from gensim.models import Word2Vec, word2vec
import pandas as pd
class DeepWalk:
- def __init__(self, graph, walk_length, num_walks, workers=1):
+ def __init__(self, graph, outlier, walk_length, num_walks, workers=1, weight = False):
self.graph = graph
self.w2v_model = None
@@ -31,19 +31,19 @@ def __init__(self, graph, walk_length, num_walks, workers=1):
self.walker = RandomWalker(
graph, p=1, q=1, )
- self.sentences = self.walker.simulate_walks(
- num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1)
+ self.sentences = self.walker.simulate_walks("deep", outlier,
+ num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1, weight = weight)
- def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):
+ def train(self, walkfile, embed_size=128, window_size=5, workers=3, iter=5, sg = 1, hs=1, **kwargs):
- kwargs["sentences"] = self.sentences
- kwargs["min_count"] = kwargs.get("min_count", 0)
- kwargs["size"] = embed_size
- kwargs["sg"] = 1 # skip gram
- kwargs["hs"] = 1 # deepwalk use Hierarchical Softmax
+ kwargs["sentences"] = word2vec.Text8Corpus(walkfile)
+ kwargs["min_count"] = kwargs.get("min_count", 1)
+ kwargs["vector_size"] = embed_size
+ kwargs["sg"] = sg # skip gram
+ kwargs["hs"] = hs # deepwalk use Hierarchical Softmax
kwargs["workers"] = workers
kwargs["window"] = window_size
- kwargs["iter"] = iter
+ kwargs["epochs"] = iter
print("Learning embedding vectors...")
model = Word2Vec(**kwargs)
@@ -62,3 +62,6 @@ def get_embeddings(self,):
self._embeddings[word] = self.w2v_model.wv[word]
return self._embeddings
+
+ def get_sentences(self):
+ return self.sentences
\ No newline at end of file
diff --git a/ge/models/line.py b/ge/models/line.py
index 04c5073..96de8b0 100644
--- a/ge/models/line.py
+++ b/ge/models/line.py
@@ -92,7 +92,7 @@ def __init__(self, graph, embedding_size=8, negative_ratio=5, order='second',):
self.node_size = graph.number_of_nodes()
self.edge_size = graph.number_of_edges()
self.samples_per_epoch = self.edge_size*(1+negative_ratio)
-
+
self._gen_sampling_table()
self.reset_model()
diff --git a/ge/models/node2vec.py b/ge/models/node2vec.py
index 16f86cb..1e0c6a8 100644
--- a/ge/models/node2vec.py
+++ b/ge/models/node2vec.py
@@ -18,7 +18,7 @@
"""
-from gensim.models import Word2Vec
+from gensim.models import Word2Vec, word2vec
import pandas as pd
from ..walker import RandomWalker
@@ -26,7 +26,7 @@
class Node2Vec:
- def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, workers=1, use_rejection_sampling=0):
+ def __init__(self, graph, outlier, walk_length, num_walks, p=1.0, q=1.0, workers=1, use_rejection_sampling=0):
self.graph = graph
self._embeddings = {}
@@ -36,19 +36,19 @@ def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, workers=1, use_r
print("Preprocess transition probs...")
self.walker.preprocess_transition_probs()
- self.sentences = self.walker.simulate_walks(
- num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1)
+ self.sentences = self.walker.simulate_walks("node", outlier,
+ num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1, weight=False)
- def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):
+ def train(self, walkfile, embed_size=128, window_size=5, workers=3, iter=5, sg = 1, hs = 1, **kwargs):
- kwargs["sentences"] = self.sentences
- kwargs["min_count"] = kwargs.get("min_count", 0)
- kwargs["size"] = embed_size
- kwargs["sg"] = 1
- kwargs["hs"] = 0 # node2vec not use Hierarchical Softmax
+ kwargs["sentences"] = word2vec.Text8Corpus(walkfile)
+ kwargs["min_count"] = kwargs.get("min_count", 1)
+ kwargs["vector_size"] = embed_size
+ kwargs["sg"] = sg
+ kwargs["hs"] = hs # node2vec not use Hierarchical Softmax
kwargs["workers"] = workers
kwargs["window"] = window_size
- kwargs["iter"] = iter
+ kwargs["epochs"] = iter
print("Learning embedding vectors...")
model = Word2Vec(**kwargs)
@@ -68,3 +68,6 @@ def get_embeddings(self,):
self._embeddings[word] = self.w2v_model.wv[word]
return self._embeddings
+
+ def get_sentences(self):
+ return self.sentences
\ No newline at end of file
diff --git a/ge/models/struc2vec.py b/ge/models/struc2vec.py
index 4040562..e637928 100644
--- a/ge/models/struc2vec.py
+++ b/ge/models/struc2vec.py
@@ -112,8 +112,8 @@ def train(self, embed_size=128, window_size=5, workers=3, iter=5):
sentences = self.sentences
print("Learning representation...")
- model = Word2Vec(sentences, size=embed_size, window=window_size, min_count=0, hs=1, sg=1, workers=workers,
- iter=iter)
+ model = Word2Vec(sentences, vector_size=embed_size, window=window_size, min_count=0, hs=1, sg=1, workers=workers,
+ epochs=iter)
print("Learning representation done!")
self.w2v_model = model
diff --git a/ge/walker.py b/ge/walker.py
index 7266585..e3408c5 100644
--- a/ge/walker.py
+++ b/ge/walker.py
@@ -35,7 +35,47 @@ def deepwalk_walk(self, walk_length, start_node):
walk.append(random.choice(cur_nbrs))
else:
break
- return walk
+
+ return " ".join(walk)
+
+ def bfs_walk(self, walk_length, start_node):
+
+ walk = [start_node]
+
+ # while len(walk) < walk_length:
+ cur = walk[-1]
+ cur_nbrs = list(self.G.neighbors(cur))
+ if len(cur_nbrs) > 0:
+ l = len(cur_nbrs)
+ ranlist = random.sample(range(0,l),l)
+ for i in ranlist:
+ walk.append(cur_nbrs[i])
+
+ return " ".join(walk)
+
+ def deepwalk_walk_weighted(self, walk_length, start_node):
+
+ walk = [start_node]
+
+ while len(walk) < walk_length:
+ cur = walk[-1]
+ cur_nbrs = list(self.G.neighbors(cur))
+ if len(cur_nbrs) > 0:
+ p = self.chose_node_p(cur, cur_nbrs)
+                walk.append(random.choices(cur_nbrs, weights=p)[0])
+ else:
+ break
+ return " ".join(walk)
+
+ # 根据边的权重,计算每个edge被选择的概率
+ def chose_node_p(self, cur, nbrs):
+ # 计算每一个位置被选择的概率,返回概率
+ weight = []
+ for i in nbrs:
+ weight.append(self.G[cur][i]["weight"])
+
+ total = sum(weight)
+ return [i/total for i in weight]
def node2vec_walk(self, walk_length, start_node):
@@ -61,7 +101,7 @@ def node2vec_walk(self, walk_length, start_node):
else:
break
- return walk
+ return " ".join(walk)
def node2vec_walk2(self, walk_length, start_node):
"""
@@ -114,36 +154,50 @@ def rejection_sample(inv_p, inv_q, nbrs_num):
walk.append(next_node)
else:
break
- return walk
-
- def simulate_walks(self, num_walks, walk_length, workers=1, verbose=0):
+ return " ".join(walk)
+ def simulate_walks(self, method, outlier, num_walks, walk_length, workers=1, verbose=0, weight = False):
G = self.G
nodes = list(G.nodes())
-
- results = Parallel(n_jobs=workers, verbose=verbose, )(
- delayed(self._simulate_walks)(nodes, num, walk_length) for num in
- partition_num(num_walks, workers))
-
- walks = list(itertools.chain(*results))
+ print(len(nodes))
+ nodes = [i for i in nodes if G.out_degree(i)]
+ nodes.extend(outlier)
+ random.shuffle(nodes)
+ print(len(nodes))
+ # results = Parallel(n_jobs=workers, verbose=verbose, )(
+ # delayed(self._simulate_walks)(method, nodes, num, walk_length, weight) for num in
+ # partition_num(num_walks, workers))
+
+ # walks = list(itertools.chain(*results))
+ walks = []
+ self._simulate_walks(walks, method, nodes, num_walks, walk_length, weight)
return walks
- def _simulate_walks(self, nodes, num_walks, walk_length,):
- walks = []
- for _ in range(num_walks):
- random.shuffle(nodes)
- for v in nodes:
- if self.p == 1 and self.q == 1:
- walks.append(self.deepwalk_walk(
- walk_length=walk_length, start_node=v))
- elif self.use_rejection_sampling:
- walks.append(self.node2vec_walk2(
- walk_length=walk_length, start_node=v))
+ def _simulate_walks(self, walks, method, nodes, num_walks, walk_length, weight = False):
+ # walks = []
+ for v in nodes:
+ for _ in range(num_walks):
+ # random.shuffle(nodes)
+ if method == "deep":
+ if weight:
+ walks.append(self.deepwalk_walk_weighted(
+ walk_length=walk_length, start_node=v))
+ else:
+ walks.append(self.deepwalk_walk(
+ walk_length=walk_length, start_node=v))
+ elif method == "bfs":
+ walks.append(self.bfs_walk(walk_length=walk_length, start_node=v))
+ elif method == "node":
+ if self.use_rejection_sampling:
+ walks.append(self.node2vec_walk2(
+ walk_length=walk_length, start_node=v))
+ else:
+ walks.append(self.node2vec_walk(
+ walk_length=walk_length, start_node=v))
else:
- walks.append(self.node2vec_walk(
- walk_length=walk_length, start_node=v))
+ pass
return walks
def get_alias_edge(self, t, v):
diff --git a/setup.py b/setup.py
index 38a4235..1843939 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
import setuptools
-with open("README.md", "r") as fh:
+with open("README.md", "r", encoding='utf-8') as fh:
long_description = fh.read()