diff --git a/.gitignore b/.gitignore index 5c8fe09..491c84c 100644 --- a/.gitignore +++ b/.gitignore @@ -18,9 +18,3 @@ # jupyter notebooks *.ipynb_checkpoints/ -# virtualenv -role2vec/* - -# python libraries -ast2vec/* -vecino/* diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..eca0afa --- /dev/null +++ b/.travis.yml @@ -0,0 +1,42 @@ +language: python +sudo: false +dist: trusty +services: +- docker +cache: + directories: + - "$HOME/.cache/pip" +addons: + apt: + packages: + - libboost-all-dev + - libxml2-dev +_install: &_install + - gimme 1.8 + - source ~/.gimme/envs/latest.env + - pip install --upgrade pip + - pip install -r requirements.txt codecov + - pip install -e . +_coverage: &_coverage + - SCRIPT="coverage run --concurrency=multiprocessing -m unittest discover && coverage combine" +matrix: + include: + - python: 3.4 + env: *_coverage + install: *_install + - python: 3.5 + env: *_coverage + install: *_install + - python: 3.6 + env: SCRIPT="pep8 --max-line-length=99 ." + install: pip install pep8 + - python: 3.6 + env: *_coverage + install: *_install + after_success: + - codecov + fast_finish: true +script: +- (eval "$SCRIPT") +notifications: + email: false diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1d6a773 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +ast2vec[tf]>=0.3.4-alpha +scikit-learn>=0.19.0 \ No newline at end of file diff --git a/role2vec/__init__.py b/role2vec/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/role2vec/__main__.py b/role2vec/__main__.py new file mode 100644 index 0000000..fedcf49 --- /dev/null +++ b/role2vec/__main__.py @@ -0,0 +1,114 @@ +import argparse +import logging +import sys + +from ast2vec.__main__ import ArgumentDefaultsHelpFormatterNoNone, one_arg_parser +from modelforge.logs import setup_logging +from role2vec.glove import glove_entry +from role2vec.node2vec import node2vec_entry +from role2vec.stats import stats_entry +from role2vec.vocab import vocab_entry +from role2vec.roles.base import ROLES_MODELS, roles_entry + + +def get_parser() -> argparse.ArgumentParser: + """ + Create main parser. + + :return: Parser + """ + parser = argparse.ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatterNoNone) + parser.add_argument("--log-level", default="INFO", choices=logging._nameToLevel, + help="Logging verbosity.") + + # Create all common arguments + + process_arg = one_arg_parser("--processes", type=int, default=2, help="Number of processes.") + vocab_arg = one_arg_parser("--vocabulary", default="vocab.txt", help="File with vocabulary.") + uast_input_arg = one_arg_parser("input", help="Input file with UASTs.") + + # Construct subparsers + + subparsers = parser.add_subparsers(help="Commands", dest="command") + + glove_parser = subparsers.add_parser( + "glove", help="Convert proximity matrices into GloVe suitable format. 
Refer to " + "https://github.com/stanfordnlp/GloVe", + formatter_class=ArgumentDefaultsHelpFormatterNoNone, + parents=[process_arg, vocab_arg]) + glove_parser.set_defaults(handler=glove_entry) + glove_parser.add_argument("input", help="Input directory with proximity matrices.") + glove_parser.add_argument("output", help="Path to store combined proximity matrix.") + glove_parser.add_argument("--filter", default="**/*.asdf", help="File name glob selector.") + + node2vec_parser = subparsers.add_parser( + "node2vec", help="Node2Vec random walk algorithm for assembling proximity matrices from " + "UASTs. Refer to https://github.com/aditya-grover/node2vec", + formatter_class=ArgumentDefaultsHelpFormatterNoNone, + parents=[process_arg, vocab_arg, uast_input_arg]) + node2vec_parser.set_defaults(handler=node2vec_entry) + node2vec_parser.add_argument("output", help="Path to store the resulting matrices.") + node2vec_parser.add_argument( + "-n", "--num-walks", type=int, default=1, help="Number of random walks from each node.") + node2vec_parser.add_argument( + "-l", "--walk-length", type=int, default=80, help="Length of each random walk.") + node2vec_parser.add_argument( + "-w", "--window", type=int, default=5, help="Window size for node context.") + node2vec_parser.add_argument( + "-p", type=float, default=1.0, + help="Controls the likelihood of immediately revisiting previous node.") + node2vec_parser.add_argument( + "-q", type=float, default=1.0, help="Controls the likelihood of exploring outward nodes.") + + roles_parser = subparsers.add_parser( + "mlp", help="Train/test roles prediction model.", + formatter_class=ArgumentDefaultsHelpFormatterNoNone, + parents=[process_arg]) + roles_parser.set_defaults(handler=roles_entry) + roles_parser.add_argument( + "algorithm", choices=ROLES_MODELS.keys(), help="Specify training algorithm.") + roles_parser.add_argument("--train", help="Input file with UASTs for training.") + roles_parser.add_argument("--test", help="Input file with UASTs for testing.") + roles_parser.add_argument("--model", required=True, help="Path to store trained model.") + roles_parser.add_argument( + "--embeddings", required=True, help="File with roles and tokens embeddings.") + + stats_parser = subparsers.add_parser( + "stats", help="Collect statistics for number of nodes w.r.t. number of node roles in " + "UASTs.", formatter_class=ArgumentDefaultsHelpFormatterNoNone, + parents=[process_arg, uast_input_arg]) + stats_parser.set_defaults(handler=stats_entry) + stats_parser.add_argument("--stat", required=True, help="Path to store resulting statisics.") + stats_parser.add_argument("--susp", required=True, help="Path to store suspicious UASTs.") + + vocab_parser = subparsers.add_parser( + "vocab", help="Collect vocabulary from UASTs.", + formatter_class=ArgumentDefaultsHelpFormatterNoNone, + parents=[process_arg, uast_input_arg]) + vocab_parser.set_defaults(handler=vocab_entry) + vocab_parser.add_argument("output", default="vocab.txt", help="Path to store vocabulary.") + + return parser + + +def main(): + """ + Create all the argparsers and invoke the function from set_defaults(). + + :return: The result of the function from set_defaults(). 
+ """ + parser = get_parser() + args = parser.parse_args() + args.log_level = logging._nameToLevel[args.log_level] + setup_logging(args.log_level) + try: + handler = args.handler + except AttributeError: + def print_usage(_): + parser.print_usage() + + handler = print_usage + return handler(args) + +if __name__ == "__main__": + sys.exit(main()) diff --git a/role2vec/glove.py b/role2vec/glove.py new file mode 100644 index 0000000..beb01b5 --- /dev/null +++ b/role2vec/glove.py @@ -0,0 +1,91 @@ +from collections import Counter +from pathlib import Path +import struct +from typing import Dict, List, Tuple + +from ast2vec.coocc import Cooccurrences +from role2vec.map_reduce import MapReduce +from role2vec.utils import read_vocab + + +class GloVe(MapReduce): + """ + Converts proximity matrices into GloVe suitable format. + Refer to https://github.com/stanfordnlp/GloVe + """ + + def __init__(self, log_level: str, num_processes: int, vocab_path: str): + """ + :param log_level: Log level of GloVe. + :param num_processes: Number of running processes. There's always one additional process + for reducing data. + :param vocab_path: Path to stored vocabulary. + """ + super(GloVe, self).__init__(log_level=log_level, num_processes=num_processes) + self.vocab = {word: i for i, word in enumerate(read_vocab(vocab_path))} + + def convert(self, src_dir: str, output: str, file_filter: str) -> None: + """ + Combine all proximity matrices and save them into GloVe suitable format. + + :param src_dir: Path to stored proximity matrices. + :param output: Path for storing the resulting GloVe suitable matrix. + :param file_filter: Pattern for recursively scanning `src_dir`. + """ + self._log.info("Scanning %s", src_dir) + files = [str(p) for p in Path(src_dir).glob(file_filter)] + self._log.info("Found %d files", len(files)) + if not files: + return 0 + + self._log.info("Combine proximity matrices.") + mat = self.combine_mats(files) + self._log.info("Finished combining.") + + self._log.info("Saving matrix.") + self.save_mat(mat, output) + + def combine_mats(self, files: List[str]) -> Dict[Tuple[str, str], int]: + """ + Combine proximity matrices. + + :param files: List of filepaths to stored proximity matrices. + :return: Mapping from token pairs to their proximity combined over all matrices. + """ + counter = Counter() + + @MapReduce.wrap_queue_in + def process_prox(self, filename): + prox = Cooccurrences().load(filename) + return {(prox.tokens[i], prox.tokens[j]): val for + i, j, val in zip(prox.matrix.row, prox.matrix.col, prox.matrix.data)} + + @MapReduce.wrap_queue_out() + def combine_prox(result): + nonlocal counter + counter.update( + {(self.vocab[i], self.vocab[j]): val for (i, j), val in result.items() + if i in self.vocab and j in self.vocab}) + + self.parallelize(files, process_prox, combine_prox) + return counter + + @staticmethod + def save_mat(mat: Dict[Tuple[str, str], int], output: str) -> None: + """ + Save matrix in GloVe suitable format. + + :param mat: Counter storing proximities. + :param output: Path for storing the resulting GloVe suitable matrix. 
+ """ + with open(output, "wb") as fout: + for (i, j), val in mat.items(): + fout.write(struct.pack("iid", i, j, int(val))) + + def _get_log_name(self): + return "GloVe" + + +def glove_entry(args): + glove = GloVe(args.log_level, args.processes, args.vocabulary) + glove.convert(args.input, args.output, args.filter) diff --git a/role2vec/map_reduce.py b/role2vec/map_reduce.py new file mode 100644 index 0000000..eb9dced --- /dev/null +++ b/role2vec/map_reduce.py @@ -0,0 +1,111 @@ +import multiprocessing +import time +from typing import List + +from ast2vec.pickleable_logger import PickleableLogger + + +class MapReduce(PickleableLogger): + """ + Base class for parallel data processign. Creates a pool of workers for data mangling and + reduces data in the main process. + """ + + def __init__(self, log_level: str, num_processes: int, queue_lim: int=100): + """ + :param log_level: Log level of MapReduce. + :param num_processes: Number of running processes. There's always one additional process + for reducing data. + :param queue_lim: Maximum number of results in queue for reducing. + """ + super(MapReduce, self).__init__(log_level=log_level) + self.num_processes = num_processes + self.queue_lim = queue_lim + + def parallelize(self, tasks: List[str], process_queue_in, process_queue_out) -> int: + """ + Process tasks in parallel. + + :param tasks: List of filenames. + :param process_queue_in: Function for processing items from the task queue. + :param process_queue_out: Function for processing items from the result queue. + :return: Number of failed tasks. + """ + queue_in = multiprocessing.Manager().Queue() + queue_out = multiprocessing.Manager().Queue(self.queue_lim) + processes = [multiprocessing.Process(target=process_queue_in, + args=(self, queue_in, queue_out)) + for i in range(self.num_processes)] + n_tasks = len(tasks) + start_time = time.time() + + self._log.info("Starting tasks.") + for p in processes: + p.start() + for t in tasks: + queue_in.put(t) + for _ in processes: + queue_in.put(None) + + failures = process_queue_out(self, n_tasks, queue_out) + for p in processes: + p.join() + + self._log.info("Finished %d/%d tasks in %.2f" % + (n_tasks - failures, n_tasks, time.time() - start_time)) + return len(tasks) - failures + + @staticmethod + def wrap_queue_in(func): + """ + Wrapper for automatic quering of tasks and storing results in the result queue. + + :param func: Function that can process a single task and accepts `self` as parameter. + """ + def wrapper(self, queue_in, queue_out): + while True: + item = queue_in.get() + if item is None: + break + try: + queue_out.put(func(self, item)) + except: + self._log.exception("%s failed", item) + queue_out.put(None) + return wrapper + + @staticmethod + def wrap_queue_out(freq: int=1000): + """ + Wrapper for allowing parametrization. + + :param freq: Logs information every `freq` iterations. + """ + def outer_wrapper(func): + """ + Wrapper for automatic quering of results and reducing them. + + :param func: Function that can process a result and accepts `self` as parameter. 
+ """ + def wrapper(self, n_tasks, queue_out): + failures = 0 + start = time.time() + + for i in range(n_tasks): + result = queue_out.get() + if (i + 1) % freq == 0: + self._log.info("Processed %d/%d in %.2f" % + (i + 1, n_tasks, time.time() - start)) + if result is None: + failures += 1 + continue + func(self, result) + + self._log.info("Finished %d/%d in %.2f seconds" % + (i + 1, n_tasks, time.time() - start)) + return failures + return wrapper + return outer_wrapper + + def _get_log_name(self): + return "MapReduce" diff --git a/role2vec/node2vec.py b/role2vec/node2vec.py new file mode 100644 index 0000000..81071dc --- /dev/null +++ b/role2vec/node2vec.py @@ -0,0 +1,122 @@ +from collections import defaultdict +from itertools import product +import os +from typing import List + +import numpy +from scipy.sparse import coo_matrix, diags + +from ast2vec.coocc import Cooccurrences +from ast2vec.uast import UASTModel +from role2vec.map_reduce import MapReduce +from role2vec.random_walk import Graph +from role2vec.utils import read_paths, read_vocab + + +class Node2Vec(MapReduce): + """ + Uses Node2Vec random walk algorithm for assembling proximity matrices from UASTs. + Refer to https://github.com/aditya-grover/node2vec + """ + + MAX_VOCAB_WORDS = 1000000 + + def __init__(self, log_level: str, num_processes: int, vocab_path: str, window: int, + graph: Graph): + """ + :param log_level: Log level of Node2Vec. + :param num_processes: Number of running processes. There's always one additional process + for reducing data. + :param vocab_path: Path to stored vocabulary. + :param window: Context window size for collecting proximities. + :param graph: Graph object for random walks generation. + """ + super(Node2Vec, self).__init__(log_level=log_level, num_processes=num_processes) + self.graph = graph + self.vocab = {w: i for i, w in enumerate(read_vocab(vocab_path, Node2Vec.MAX_VOCAB_WORDS))} + self.window = window + + def process(self, fname: str, output_dir: str) -> None: + """ + Extract proximity matrices from UASTs. + + :param fname: Path to file with filepaths to stored UASTs. + :param output_dir: Path to directory for storing proximity matrices. + """ + self._log.info("Scanning %s", fname) + paths = read_paths(fname) + self._log.info("Found %d files", len(paths)) + + @MapReduce.wrap_queue_in + def process_uast(self, obj): + filename, output = obj + self._log.info("Processing %s", filename) + uast = UASTModel().load(filename) + dok_matrix = defaultdict(int) + + for walk in self.graph.simulate_walks(uast): + walk = [[self.vocab[t] for t in map(str, node.tokens) + if t in self.vocab] for node in walk] + # Connect each token to the next `self.window` tokens. 
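+                # Note: the slice below covers the next `self.window - 1` nodes of the walk
+                # (fewer near its end); counts are kept per ordered id pair here and the
+                # matrix is symmetrized after the loop via `mat + mat.T - diags(mat.diagonal())`.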
+ for i, cur_tokens in enumerate(walk[:-1]): + for next_tokens in walk[(i + 1):(i + self.window)]: + for word1, word2 in product(cur_tokens, next_tokens): + # Symmetry will be accounted for later + dok_matrix[(word1, word2)] += 1 + + del uast + + mat = coo_matrix( + (Node2Vec.MAX_VOCAB_WORDS, Node2Vec.MAX_VOCAB_WORDS), dtype=numpy.int32) + mat.row = row = numpy.empty(len(dok_matrix), dtype=numpy.int32) + mat.col = col = numpy.empty(len(dok_matrix), dtype=numpy.int32) + mat.data = data = numpy.empty(len(dok_matrix), dtype=numpy.int32) + for i, (coord, val) in enumerate(sorted(dok_matrix.items())): + row[i], col[i] = coord + data[i] = val + + del dok_matrix + # Accounting for symmetry + mat = coo_matrix(mat + mat.T - diags(mat.diagonal())) + + coocc = Cooccurrences() + coocc.construct(tokens=sorted(self.vocab, key=self.vocab.get), matrix=mat) + coocc.save(output) + self._log.info("Finished processing %s", filename) + return filename + + @MapReduce.wrap_queue_out() + def process_output(self, result): + pass + + self._log.info("Preprocessing file names.") + paths = self._preprocess_paths(paths, output_dir) + self.parallelize(paths, process_uast, process_output) + + def _get_log_name(self): + return "Node2Vec" + + def _preprocess_paths(self, paths: List[str], output_dir: str) -> List[str]: + """ + Prepare paths for storing proximity matrices. + + :param paths: List of filepaths to stored UASTs. + :param output_dir: Path to directory for storing proximity matrices. + :return: List of filepaths for storing proximity matrices. + """ + preprocessed_paths = [] + for p in paths: + name = os.path.basename(p) + if name.startswith("uast_"): + name = name[len("uast_"):] + out_dir = os.path.join(output_dir, name[0]) + os.makedirs(out_dir, exist_ok=True) + out_fname = os.path.join(out_dir, name) + preprocessed_paths.append((p, out_fname)) + return preprocessed_paths + + +def node2vec_entry(args): + graph = Graph(args.log_level, args.num_walks, args.walk_length, args.p, args.q) + node2vec = Node2Vec(args.log_level, args.processes, args.vocabulary, args.window, graph) + node2vec.process(args.input, args.output) diff --git a/role2vec/random_walk.py b/role2vec/random_walk.py new file mode 100644 index 0000000..3df5dbd --- /dev/null +++ b/role2vec/random_walk.py @@ -0,0 +1,187 @@ +from collections import namedtuple +import random +from typing import Dict, Iterator, List, Tuple + +import numpy as np + +from ast2vec.pickleable_logger import PickleableLogger +from ast2vec.token_parser import TokenParser +from role2vec.utils import node_iterator + +GraphNode = namedtuple("GraphNode", ["id", "neighbors", "tokens"]) + + +class Graph(PickleableLogger): + """ + Generates random walks from UASTs. + """ + + def __init__(self, log_level: str, num_walks: int, walk_length: int, p: float, q: float): + """ + :param log_level: Log level of Node2Vec. + :param num_walks: Number of random walks from each node. + :param walk_length: Random walk length. + :param p: Controls the likelihood of immediately revisiting previous node. + :param q: Controls the likelihood of exploring outward nodes. 
+ """ + if walk_length <= 1: + raise ValueError("Random walks have at least two nodes.") + + super(Graph, self).__init__(log_level=log_level) + self.num_walks = num_walks + self.walk_length = walk_length + self.p = 1 / p + self.q = 1 / q + self.token_parser = TokenParser() + + def node2vec_walk(self, start_node: GraphNode, edges: Dict[Tuple[int, int], None], + nodes: List[GraphNode]) -> List[GraphNode]: + """ + Simulate a random walk starting from start node. + + :param start_node: Starting node for random walk. + :param edges: Dict for storing mapping from node id pairs to transition probabilities. + :param nodes: List of UAST nodes. + :return: List of GraphNodes in random walk. + """ + walk = [None] * self.walk_length + prev_node = walk[0] = start_node + cur_node = walk[1] = nodes[random.choice(start_node.neighbors)] + + for i in range(2, self.walk_length): + J, q = edges[(prev_node.id, cur_node.id)] + kk = np.random.randint(len(J)) + + # Draw a sample from discrete distribution at constant time. + if np.random.rand() < q[kk]: + ind = kk + else: + ind = J[kk] + + prev_node = cur_node + cur_node = walk[i] = nodes[cur_node.neighbors[ind]] + + return walk + + def simulate_walks(self, uasts) -> Iterator[List[GraphNode]]: + """ + Repeatedly simulate random walks from each node. + + :param uasts: List of UASTs. + :return: Iterator over random walks generated for the input UASTs. + """ + for uast, filename in zip(uasts.uasts, uasts.filenames): + nodes, edges = self._preprocess_uast(uast) + n_nodes = len(nodes) + + if n_nodes == 1: + self._log.info("Skipping UAST for %s: has a single node." % filename) + continue + + self._preprocess_transition_probs(nodes, edges) + self._log.info("Walk iteration:") + + for walk_iter in range(self.num_walks): + self._log.info("%d/%d" % (walk_iter + 1, self.num_walks)) + iter_nodes = set(node.id for node in nodes) + + while iter_nodes: + node = nodes[random.sample(iter_nodes, 1)[0]] + walk = self.node2vec_walk(node, edges, nodes) + yield walk + + for walk_node in walk: + if walk_node.id in iter_nodes: + iter_nodes.remove(walk_node.id) + + def _get_log_name(self): + return "Graph" + + def _get_tokens(self, uast_node) -> List[str]: + """ + Return node tokens. + + :param uast_node: UAST node. + :return: List of tokens. + """ + return ["RoleId_%d" % role for role in uast_node.roles] + \ + list(self.token_parser.process_token(uast_node.token)) + + def _preprocess_transition_probs(self, nodes: List[GraphNode], + edges: Dict[Tuple[int, int], None]) -> None: + """ + Preprocessing of transition probabilities for guiding the random walks. + + :param nodes: List of GraphNodes in UAST. + :param edges: Dict for storing mapping from node id pairs to transition probabilities. + """ + self._log.info("Preprocessing transition probabilities.") + for edge in edges: + unnormalized_probs = np.array([ + self.p if dst_nbr == edge[0] else + 1 if (dst_nbr, edge[0]) in edges else + self.q for dst_nbr in nodes[edge[1]].neighbors + ]) + edges[edge] = alias_setup(unnormalized_probs / unnormalized_probs.sum()) + + def _preprocess_uast(self, root) -> Tuple[List[GraphNode], Dict[Tuple[int, int], None]]: + """ + Add neighbors information to UAST nodes. + + :param root: Root node in UAST. + :return: Nodes and edges in the UAST. 
+ """ + def create_node(node, id): + return GraphNode(id=id, neighbors=[], tokens=self._get_tokens(node)) + + self._log.info("Preprocessing UAST nodes.") + edges = {} + nodes = [create_node(root, 0)] + n_nodes = 1 + + for node, node_idx in node_iterator(root): + for child in node.children: + nodes.append(create_node(child, n_nodes)) + nodes[n_nodes].neighbors.append(node_idx) + nodes[node_idx].neighbors.append(n_nodes) + edges[(node_idx, n_nodes)] = edges[(n_nodes, node_idx)] = None + n_nodes += 1 + + return nodes, edges + + +def alias_setup(probs: np.array) -> Tuple[np.array, np.array]: + """ + Compute utility lists for non-uniform sampling from discrete distributions. + Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with + -many-discrete-outcomes/ + for details + + :param probs: Discrete distribution. + :return: Two helper tables. + """ + K = len(probs) + q = probs * K + J = np.zeros(K, dtype=np.int) + + # Sort the data into the outcomes with probabilities that are larger and smaller than 1/K. + smaller = np.where(q < 1.0)[0] + larger = np.where(q >= 1.0)[0] + s_idx = len(smaller) - 1 + l_idx = len(larger) - 1 + + # Loop through and create little binary mixtures that appropriately allocate the larger + # outcomes over the overall uniform mixture. + while s_idx >= 0 and l_idx >= 0: + small = smaller[s_idx] + large = larger[l_idx] + J[small] = large + q[large] += q[small] - 1.0 + + if q[large] < 1.0: + smaller[s_idx] = large + l_idx -= 1 + else: + s_idx -= 1 + + return J, q diff --git a/role2vec/roles/base.py b/role2vec/roles/base.py new file mode 100644 index 0000000..8f5b761 --- /dev/null +++ b/role2vec/roles/base.py @@ -0,0 +1,92 @@ +import os + +from sklearn.externals import joblib + +from ast2vec.token_parser import TokenParser +from role2vec.map_reduce import MapReduce +from role2vec.utils import read_embeddings + +ROLES_MODELS = dict() + + +def register_roles_model(cls): + """ + Check some conventions for class declaration and add it to ROLES_MODELS. + + :param cls: Class for roles prediction. + """ + base = "Roles" + assert issubclass(cls, RolesBase), "Must be a subclass of RolesBase." + assert cls.__name__.startswith(base), "Make sure to start your class name with %s." % (base, ) + ROLES_MODELS[cls.__name__[len(base):].lower()] = cls + + return cls + + +class RolesBase(MapReduce): + """ + Base class for roles prediction. + """ + + def __init__(self, log_level: str, num_processes: int, emb_path: str): + """ + :param log_level: Log level of RolesBase. + :param num_processes: Number of running processes. There's always one additional process + for reducing data. + :param emb_path: Path to stored roles embeddings. + """ + super(RolesBase, self).__init__(log_level=log_level, num_processes=num_processes) + self.emb, self.roles = read_embeddings(emb_path) + self.model = None + self.token_parser = TokenParser() + + def save(self, model_path: str) -> None: + """ + Store trained model on disk. + + :param model_path: Path for storing trained model. + """ + if self.model is None: + raise ValueError("Model is empty.") + self._log.info("Saving model to %s.", model_path) + joblib.dump(self.model, model_path) + + def load(self, model_path: str) -> None: + """ + Load trained model from disk. + + :param model_path: Path to trained model. + """ + if not os.path.exists(model_path): + raise ValueError("Provided path to model doesn't exist: %s", model_path) + self.model = joblib.load(model_path) + + def train(self, fname: str) -> None: + """ + Train model. 
+ + :param fname: Path to train file with filepaths to stored UASTs. + """ + raise NotImplementedError + + def test(self, fname: str) -> None: + """ + Test model. + + :param fname: Path to test file with filepaths to stored UASTs. + """ + raise NotImplementedError + + +def roles_entry(args): + RolesModel = ROLES_MODELS[args.algorithm] + rm = RolesModel(args.log_level, args.processes, args.embeddings) + + if args.train: + rm.train(args.train) + rm.save(args.model) + else: + rm.load(args.model) + + if args.test: + rm.test(args.test) diff --git a/role2vec/roles/mlp.py b/role2vec/roles/mlp.py new file mode 100644 index 0000000..7c85b52 --- /dev/null +++ b/role2vec/roles/mlp.py @@ -0,0 +1,141 @@ +from itertools import chain +import time +from typing import Dict, Tuple + +import numpy as np +from sklearn.neural_network import MLPClassifier + +from ast2vec.uast import UASTModel +from role2vec.map_reduce import MapReduce +from role2vec.roles_base import register_roles_model, RolesBase +from role2vec.utils import node_iterator, read_paths + + +@register_roles_model +class RolesMLP(RolesBase): + """ + Predicts roles using Multi-Layer Perceptron. + """ + + def train(self, fname: str) -> None: + """ + Train model. + + :param fname: Path to train file with filepaths to stored UASTs. + """ + paths = read_paths(fname) + + self._log.info("Train model.") + self.model = MLPClassifier(random_state=1, verbose=True) + self.model.classes_ = sorted(self.roles.values()) + counter = 0 + start = time.time() + + @MapReduce.wrap_queue_out() + def train_uast(self, result): + nonlocal counter, start + X, y = result + counter += 1 + self.model.partial_fit(X, y) + print(self.model.loss_, time.time() - start, counter) + + self.parallelize(paths, _process_uast, train_uast) + self._log.info("Finished training.") + + def test(self, fname: str) -> None: + """ + Test model. + + :param fname: Path to test file with filepaths to stored UASTs. + """ + paths = read_paths(fname) + + self._log.info("Test model.") + y_real, y_pred = [], [] + + @MapReduce.wrap_queue_out() + def test_uast(self, result): + nonlocal y_real, y_pred + X, y = result + y_real.extend(y) + y_pred.extend(self.model.predict_proba(X)) + + self.parallelize(paths, _process_uast, test_uast) + np.save("y_real.npy", y_real) + np.save("y_pred.npy", y_pred) + self._log.info("Finished testing.") + + def _mean_vec(self, node) -> Tuple[np.array, int]: + """ + Calculate mean of role/token embeddings for a node. + + :param node: UAST node. + :return: Mean of role/token embeddings and their total number. + """ + tokens = [t for t in chain(node.token, ("RoleId_%d" % role for role in node.roles)) + if t in self.emb] + if not tokens: + return None, 0 + return np.mean([self.emb[t] for t in tokens], axis=0), len(tokens) + + def _mean_vecs(self, root) -> Tuple[Dict[int, np.array], Dict[int, np.array]]: + """ + Calculate mean of role/token embeddings for nodes and their children in a UAST. + + :param root: UAST root node. + :return: Mappings from node indices to their parent's and their childrens' mean role/token + embeddings. 
+ """ + node_vecs = {0: self._mean_vec(root)} + child_vecs = {} + parent_vecs = {0: None} + n_nodes = 1 # incremented in accoradance with node_iterator + + for node, node_idx in node_iterator(root): + node_child_vecs = [] + node_child_ns = [] + + for child in node.children: + child_vec = self._mean_vec(child) + node_vecs[n_nodes] = child_vec + parent_vecs[n_nodes] = node_vecs[node_idx][0] + node_child_vecs.append(child_vec[0]) + node_child_ns.append(child_vec[1]) + n_nodes += 1 + + node_child_vecs = list(filter(lambda x: x is not None, node_child_vecs)) + node_child_ns = list(filter(lambda x: x != 0, node_child_ns)) + + if node_child_vecs: + child_vecs[node_idx] = np.average(node_child_vecs, axis=0, weights=node_child_ns) + else: + child_vecs[node_idx] = None + + return child_vecs, parent_vecs + + +@MapReduce.wrap_queue_in +def _process_uast(self, filename: str) -> Tuple[np.array, np.array]: + """ + Convert UAST into feature and label arrays. + Had to be defined outside of RolesMLP so that we don't suppply `self` twice. + + :param filename: Path to stored UAST. + :return: Array of concatenated mean parent and children role/token embeddings for each node and + the corresponding array of node roles. + """ + X, y = [], [] + uast_model = UASTModel().load(filename) + + for uast in uast_model.uasts: + child_vecs, parent_vecs = self._mean_vecs(uast) + for node, node_idx in node_iterator(uast): + child_vec = child_vecs[node_idx] + parent_vec = parent_vecs[node_idx] + if child_vec is not None and parent_vec is not None: + labels = np.zeros(len(self.roles), dtype=np.int8) + labels[[self.roles["RoleId_%d" % role] for role in node.roles]] = 1 + X.append(np.concatenate((child_vec, parent_vec))) + y.append(labels) + + return np.array(X), np.array(y) diff --git a/role2vec/stats.py b/role2vec/stats.py new file mode 100644 index 0000000..2bfa2d2 --- /dev/null +++ b/role2vec/stats.py @@ -0,0 +1,56 @@ +from collections import Counter +import json + +from ast2vec.uast import UASTModel +from role2vec.map_reduce import MapReduce +from role2vec.utils import node_iterator, read_paths + + +class RolesStats(MapReduce): + """ + Collects statistics for number of nodes w.r.t. number of node roles in all UASTs. + """ + + def calc(self, fname: str, stat_output: str, susp_output: str) -> None: + """ + Compute statistics and store them in JSON format. + + :param fname: Path to file with filepaths to stored UASTs. + :param stat_output: Path for storing JSON file with statistics. + :param susp_output: Path for storing txt file with info about suspicious UASTs. The file + has three columns: filepath to UAST, number of nodes in UAST, number of + nodes without roles in UAST. 
+ """ + paths = read_paths(fname) + global_counter = Counter() + suspicious = [] + + @MapReduce.wrap_queue_in + def process_uast(self, filename): + counter = Counter() + uast_model = UASTModel().load(filename) + for uast in uast_model.uasts: + for node, _ in node_iterator(uast): + counter[len(node.roles)] += 1 + return counter, filename + + @MapReduce.wrap_queue_out() + def combine_stat(self, result): + nonlocal global_counter + counter, filename = result + global_counter.update(counter) + if 0 in counter: + suspicious.append((filename, sum(counter.values()), counter[0])) + + self.parallelize(paths, process_uast, combine_stat) + with open(stat_output, "w") as fout: + json.dump(global_counter, fout) + with open(susp_output, "w") as fout: + for susp_entry in suspicious: + fout.write(", ".join(map(str, susp_entry)) + "\n") + self._log.info("Finished collecting statistics.") + + +def stats_entry(args): + role_stat = RolesStats(args.log_level, args.processes) + role_stat.calc(args.input, args.stat, args.susp) diff --git a/role2vec/tests/__init__.py b/role2vec/tests/__init__.py new file mode 100644 index 0000000..9aeddfc --- /dev/null +++ b/role2vec/tests/__init__.py @@ -0,0 +1,5 @@ +from modelforge.logs import setup_logging + + +def setup(): + setup_logging("INFO") diff --git a/role2vec/tests/models.py b/role2vec/tests/models.py new file mode 100644 index 0000000..8a9feb9 --- /dev/null +++ b/role2vec/tests/models.py @@ -0,0 +1,5 @@ +import os + +UAST = os.path.join(os.path.dirname(__file__), "uast.asdf") +UAST_FILE = os.path.join(os.path.dirname(__file__), "uast.txt") +VOCAB = os.path.join(os.path.dirname(__file__), "vocab.txt") diff --git a/role2vec/tests/test_roles_base.py b/role2vec/tests/test_roles_base.py new file mode 100644 index 0000000..bee53d0 --- /dev/null +++ b/role2vec/tests/test_roles_base.py @@ -0,0 +1,51 @@ +import os +import tempfile +import unittest + +from sklearn.externals import joblib + +from role2vec.roles.base import RolesBase + + +class RolesBaseTests(unittest.TestCase): + def setUp(self): + self.model = 1334 + with tempfile.NamedTemporaryFile(delete=False) as model_path: + self.model_path = model_path.name + joblib.dump(self.model, self.model_path) + with tempfile.NamedTemporaryFile() as emb_path: + self.rb = RolesBase(log_level="INFO", num_processes=1, emb_path=emb_path.name) + + def tearDown(self): + os.remove(self.model_path) + + def test_save(self): + with self.assertRaises(ValueError): + self.rb.save("") + try: + self.rb.model = self.model + with tempfile.NamedTemporaryFile() as model_path: + self.assertIsNone(self.rb.save(model_path.name)) + finally: + self.rb.model = None + + def test_load(self): + with self.assertRaises(ValueError): + self.rb.load("") + try: + self.rb.load(self.model_path) + self.assertEqual(self.rb.model, self.model) + finally: + self.rb.model = None + + def test_train(self): + with self.assertRaises(NotImplementedError): + self.rb.train("") + + def test_test(self): + with self.assertRaises(NotImplementedError): + self.rb.test("") + + +if __name__ == "__main__": + unittest.main() diff --git a/role2vec/tests/test_stats.py b/role2vec/tests/test_stats.py new file mode 100644 index 0000000..aa03c2a --- /dev/null +++ b/role2vec/tests/test_stats.py @@ -0,0 +1,23 @@ +import json +import tempfile +import unittest + +from role2vec.stats import RolesStats +import role2vec.tests.models as paths + + +class RolesStatsTests(unittest.TestCase): + def setUp(self): + self.rs = RolesStats(log_level="INFO", num_processes=1) + + def test_calc(self): + with 
tempfile.NamedTemporaryFile() as stat, tempfile.NamedTemporaryFile() as susp: + self.rs.calc(paths.UAST_FILE, stat.name, susp.name) + role_stats = json.loads(stat.read().decode("utf8")) + self.assertEqual(role_stats, {"0": 1, "1": 498, "2": 830, "3": 1634, "4": 1407, + "5": 412, "6": 718, "7": 2, "8": 4, "10": 359, + "11": 411}) + + +if __name__ == "__main__": + unittest.main() diff --git a/role2vec/tests/test_vocab.py b/role2vec/tests/test_vocab.py new file mode 100644 index 0000000..2606d58 --- /dev/null +++ b/role2vec/tests/test_vocab.py @@ -0,0 +1,23 @@ +import unittest + +from role2vec.vocab import Vocab +import role2vec.tests.models as paths + + +class VocabTests(unittest.TestCase): + def setUp(self): + self.vocab = Vocab(log_level="INFO", num_processes=1) + self.words_true = {} + with open(paths.VOCAB) as fin: + for line in fin: + word, count = line.split() + self.words_true[word] = int(count) + + def test_create(self): + words = self.vocab.create([paths.UAST]) + self.assertEqual(len(words), 539) + self.assertEqual(words, self.words_true) + + +if __name__ == "__main__": + unittest.main() diff --git a/role2vec/tests/uast.asdf b/role2vec/tests/uast.asdf new file mode 100644 index 0000000..b45d4b4 Binary files /dev/null and b/role2vec/tests/uast.asdf differ diff --git a/role2vec/tests/uast.txt b/role2vec/tests/uast.txt new file mode 100644 index 0000000..280aa45 --- /dev/null +++ b/role2vec/tests/uast.txt @@ -0,0 +1 @@ +role2vec/tests/uast.asdf \ No newline at end of file diff --git a/role2vec/tests/vocab.txt b/role2vec/tests/vocab.txt new file mode 100755 index 0000000..47e21cf --- /dev/null +++ b/role2vec/tests/vocab.txt @@ -0,0 +1,539 @@ +RoleId_18 5226 +RoleId_1 4165 +RoleId_85 2939 +RoleId_49 1908 +RoleId_45 1728 +RoleId_41 1095 +RoleId_47 1082 +RoleId_4 863 +RoleId_89 833 +RoleId_87 774 +RoleId_2 719 +RoleId_86 666 +RoleId_48 624 +RoleId_99 568 +self 530 +RoleId_110 349 +path 344 +RoleId_6 328 +RoleId_7 326 +RoleId_3 321 +RoleId_19 306 +RoleId_61 281 +bucket 262 +blob 228 +RoleId_105 211 +RoleId_46 211 +content 200 +assert 175 +model 154 +RoleId_50 147 +RoleId_80 135 +RoleId_94 105 +test 103 +RoleId_11 101 +RoleId_107 97 +manag 91 +RoleId_109 90 +RoleId_63 87 +name 87 +RoleId_62 87 +checkpoint 85 +return 83 +equal 82 +exists 78 +true 78 +file 78 +RoleId_42 71 +get 67 +RoleId_79 67 +RoleId_96 63 +nil 62 +RoleId_100 55 +false 55 +type 50 +RoleId_35 48 +format 47 +RoleId_43 46 +RoleId_68 46 +the 45 +delete 45 +RoleId_103 44 +notebook 44 +old 44 +directori 44 +RoleId_95 38 +new 38 +RoleId_81 37 +RoleId_21 37 +other 36 +dir 36 +list 34 +RoleId_20 34 +RoleId_5 32 +string 32 +raise 30 +RoleId_83 30 +from 29 +txt 28 +not 27 +save 26 +base 25 +google 24 +for 23 +upload 23 +blobs 21 +creat 21 +instanc 21 +last 21 +modifi 20 +web 20 +RoleId_64 20 +error 19 +debug 19 +storag 19 +rror 19 +text 18 +RoleId_39 18 +folder 18 +max 18 +httpe 18 +RoleId_91 18 +isinst 17 +client 17 +hidden 17 +ishidden 16 +mimetyp 16 +parse 15 +default 15 +cloud 15 +args 15 +applic 14 +RoleId_82 14 +class 14 +param 14 +size 13 +log 13 +RoleId_93 13 +json 13 +parent 13 +nbformat 13 +rename 13 +unicod 12 +RoleId_27 12 +cache 12 +RoleId_17 12 +RoleId_71 12 +endswith 12 +RoleId_70 12 +fetch 11 +throw 11 +errno 11 +create 10 +RoleId_26 10 +help 10 +config 10 +ipynb 10 +uuid 10 +startswith 10 +obj 10 +writabl 9 +RoleId_101 9 +result 9 +none 9 +will 9 +hook 9 +xdirectori 9 +pickle 9 +prefix 9 +RoleId_77 8 +isnone 8 +ofthe 8 +updat 8 +RoleId_15 8 +RoleId_78 8 +post 8 +acheckpoint 7 +dict 7 +dotted 7 +project 7 +read 7 
+RoleId_30 7 +hide 7 +str 6 +download 6 +messag 6 +islice 6 +broken 6 +jgscm 6 +bcontent 6 +gcs 6 +ospath 6 +found 6 +pipe 6 +this 6 +utf 6 +blah 6 +data 6 +request 6 +valid 6 +datetim 6 +encode 5 +files 5 +asstr 5 +url 5 +epipe 5 +set 5 +delimit 5 +member 5 +value 5 +raises 5 +staticmethod 5 +info 5 +bool 5 +except 5 +such 5 +afile 5 +ofclass 5 +with 5 +run 5 +python 5 +nosuch 4 +blahblah 4 +cls 4 +languag 4 +program 4 +saving 4 +untitl 4 +encod 4 +reads 4 +builds 4 +bad 4 +decode 4 +version 4 +slash 4 +adirectori 4 +bytes 4 +tornado 4 +plain 4 +used 4 +licens 4 +kwargs 4 +sfor 4 +tuple 3 +notebooknod 3 +servic 3 +gsclient 3 +xipynb 3 +forbidden 3 +current 3 +ascii 3 +should 3 +one 3 +dumps 3 +tmpl 3 +stream 3 +keyfil 3 +anotebook 3 +generic 3 +you 3 +RoleId_52 3 +replac 3 +RoleId_34 3 +ifcont 3 +len 3 +output 3 +isrequest 3 +which 3 +fold 3 +main 3 +force 3 +node 3 +and 3 +unescap 3 +author 3 +rsplit 3 +vadim 2 +sfrom 2 +develop 2 +apath 2 +github 2 +restor 2 +raw 2 +tokeep 2 +traitlet 2 +copy 2 +beencod 2 +reason 2 +jupyt 2 +part 2 +com 2 +socket 2 +djgscm 2 +non 2 +provid 2 +fmt 2 +anoth 2 +sys 2 +togcs 2 +ifbase 2 +requir 2 +execut 2 +exc 2 +paramet 2 +form 2 +gcloud 2 +two 2 +any 2 +bydefault 2 +setup 2 +src 2 +RoleId_51 2 +packag 2 +alist 2 +keep 2 +tothe 2 +key 2 +atthe 2 +descript 2 +wrap 2 +decodebyt 2 +isnot 2 +nofile 2 +while 2 +int 2 +includ 2 +append 2 +convert 2 +check 2 +bedecod 2 +case 2 +asbase 2 +attribut 2 +asutf 2 +popul 2 +classmethod 2 +amodel 2 +mit 2 +ifyou 2 +iftext 2 +decod 2 +orbase 2 +anon 2 +ifnot 2 +donot 2 +https 2 +can 2 +just 2 +adict 2 +split 2 +exist 2 +octet 2 +properti 2 +cells 2 +join 2 +state 2 +may 2 +sub 2 +mime 2 +encodebyt 2 +gets 2 +ext 2 +mixin 2 +bbytes 2 +splitext 2 +only 1 +asingl 1 +thereof 1 +relat 1 +single 1 +mark 1 +dot 1 +root 1 +sown 1 +tochang 1 +script 1 +unhandl 1 +call 1 +tosplit 1 +inwhich 1 +super 1 +where 1 +inside 1 +asconvert 1 +rfind 1 +interpret 1 +bepopul 1 +ordirectori 1 +use 1 +process 1 +ingcs 1 +try 1 +beused 1 +itexist 1 +redefin 1 +end 1 +snew 1 +writes 1 +always 1 +same 1 +called 1 +api 1 +uses 1 +revers 1 +ralreadi 1 +pre 1 +sname 1 +RoleId_44 1 +nbconvert 1 +jsone 1 +ifformat 1 +ifthe 1 +repr 1 +desktop 1 +splits 1 +count 1 +into 1 +approv 1 +ifunicod 1 +upclass 1 +ajson 1 +unexpect 1 +importstr 1 +ifpath 1 +setuptool 1 +level 1 +reader 1 +tointerpret 1 +serial 1 +jgcsm 1 +miss 1 +change 1 +items 1 +bysave 1 +failed 1 +double 1 +asunicod 1 +isunknown 1 +interact 1 +start 1 +when 1 +beeither 1 +time 1 +callabl 1 +itertool 1 +own 1 +instal 1 +iffals 1 +behandl 1 +librari 1 +indic 1 +status 1 +noconvert 1 +sort 1 +needed 1 +faster 1 +either 1 +becal 1 +someth 1 +isneed 1 +tear 1 +via 1 +unknown 1 +specifi 1 +open 1 +RoleId_98 1 +alreadi 1 +classifi 1 +consid 1 +html 1 +handl 1 +oper 1 +agiven 1 +retriev 1 +empty 1 +orhtml 1 +ortupl 1 +tocach 1 +down 1 +next 1 +acont 1 +offile 1 +names 1 +tonew 1 +extract 1 +saved 1 +pick 1 +touse 1 +greater 1 +ashidden 1 +limit 1 +nbclass 1 +agener 1 +but 1 +was 1 +namespac 1 +ascript 1 +intern 1 +spath 1 +defin 1 +orimportstr 1 +loads 1 +update 1 +email 1 +stdout 1 +metadata 1 +object 1 +astext 1 +softwar 1 +users 1 +intend 1 +otherwis 1 +context 1 +structur 1 +readme 1 +toprocess 1 +apart 1 +cell 1 +dirnam 1 +unittest 1 +ifempti 1 +sign 1 +isused 1 +ofcheckpoint 1 +like 1 +toopen 1 +asvers 1 +keyword 1 +ondisk 1 +ingoogl 1 +onthe 1 +isipynb 1 +disk 1 +markovtsev 1 +explicit 1 +must 1 +common 1 +osi 1 +topic 1 +whether 1 +RoleId_111 1 +code 1 +iftrue 1 +trust 1 +ipython 1 +errors 1 
+ofnbformat 1 +wtf 1 +input 1 +RoleId_84 1 +ifdefin 1 +collaps 1 +tech 1 +sourc 1 +nump 1 +given 1 +account 1 +escape 1 +audienc 1 +alpha 1 \ No newline at end of file diff --git a/role2vec/utils.py b/role2vec/utils.py new file mode 100644 index 0000000..c71092c --- /dev/null +++ b/role2vec/utils.py @@ -0,0 +1,52 @@ +from itertools import islice +from typing import Dict, List, Tuple + +import numpy as np + + +def node_iterator(root): + """ + Enumerate UAST nodes using depth-first approach. + """ + queue = [(root, 0)] + n_nodes = 1 + while queue: + node, node_idx = queue.pop() + yield node, node_idx + for child in node.children: + queue.append((child, n_nodes)) + n_nodes += 1 + + +def read_embeddings(emb_path: str) -> Tuple[Dict[str, np.array], List[str]]: + emb = {} + roles = [] + + with open(emb_path) as fin: + for line in fin: + word, *vec = line.split("\t") + emb[word] = np.array(vec, dtype=np.float) + if word.startswith("RoleId_"): + roles.append(word) + + roles = {role: i for i, role in enumerate(roles)} + return emb, roles + + +def read_paths(fname: str) -> List[str]: + with open(fname) as fin: + paths = [line.strip() for line in fin.readlines()] + if not paths: + raise ValueError("Make sure the file is not empty!") + return paths + + +def read_vocab(vocab_path: str, num_words: int=None) -> List[str]: + with open(vocab_path) as fin: + words = [line.split(" ")[0] for line in islice(fin, num_words)] + return words + + +def save_vocab(vocab_path: str, vocab: Dict[str, int]) -> None: + with open(vocab_path, "w") as fout: + fout.write("\n".join(map(lambda x: "%s %d" % x, vocab.most_common()))) diff --git a/role2vec/vocab.py b/role2vec/vocab.py new file mode 100644 index 0000000..dd85d3a --- /dev/null +++ b/role2vec/vocab.py @@ -0,0 +1,68 @@ +from collections import Counter +from typing import Dict, List + +from ast2vec.token_parser import TokenParser +from ast2vec.uast import UASTModel +from role2vec.map_reduce import MapReduce +from role2vec.utils import node_iterator, read_paths, save_vocab + + +class Vocab(MapReduce): + """ + Collects vocabulary from UASTs. + """ + + def __init__(self, log_level: str, num_processes: int): + """ + :param log_level: Log level of Vocab. + :param num_processes: Number of running processes. There's always one additional process + for reducing data. + """ + super(Vocab, self).__init__(log_level=log_level, num_processes=num_processes) + self.token_parser = TokenParser() + + def create(self, files: List[str]) -> Dict[str, int]: + """ + Create vocabulary by processing supplied UASTs. + + :param files: List of filepaths to stored UASTs. + :return: Dict with tokens and their number of occurrences. + """ + vocab = Counter() + + @MapReduce.wrap_queue_in + def uasts_vocab(self, filename): + uast_model = UASTModel().load(filename) + tokens = Counter() + for uast in uast_model.uasts: + for node, _ in node_iterator(uast): + tokens.update(self._get_tokens(node)) + return tokens + + @MapReduce.wrap_queue_out() + def combine_vocab(self, result): + nonlocal vocab + vocab.update(result) + + self.parallelize(files, uasts_vocab, combine_vocab) + return vocab + + def _get_log_name(self): + return "Vocab" + + def _get_tokens(self, uast_node) -> List[str]: + """ + Return node tokens. + + :param uast_node: UAST node. + :return: List of tokens. 
+ """ + return ["RoleId_%d" % role for role in uast_node.roles] + \ + list(self.token_parser.process_token(uast_node.token)) + + +def vocab_entry(args): + uasts = read_paths(args.input) + vocab = Vocab(args.log_level, args.processes) + words = vocab.create(uasts) + save_vocab(args.output, words) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..2a179ed --- /dev/null +++ b/setup.py @@ -0,0 +1,40 @@ +import sys + +from setuptools import setup, find_packages + +if sys.version_info < (3, 5, 0): + typing = ["typing"] +else: + typing = [] + +setup( + name="role2vec", + description="Part of source{d}'s stack for machine learning on source code. Provides API and " + "tools to train and use models for role prediction of UAST nodes extracted from " + "Babelfish.", + version="0.0.1-alpha", + license="Apache 2.0", + author="source{d}", + author_email="machine-learning@sourced.tech", + url="https://github.com/src-d/role2vec", + download_url="https://github.com/src-d/role2vec", + packages=find_packages(exclude=("role2vec.tests",)), + entry_points={ + "console_scripts": ["role2vec=role2vec.__main__:main"], + }, + keywords=["machine learning on source code", "word2vec", "id2vec", + "github", "swivel", "nbow", "bblfsh", "babelfish"], + install_requires=["ast2vec[tf]>=0.3.4-alpha", "scikit-learn>=0.19.0"] + typing, + package_data={"": ["LICENSE", "README.md"]}, + classifiers=[ + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Operating System :: POSIX", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Topic :: Software Development :: Libraries" + ] +)
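
For reference, a minimal sketch of how the modules introduced in this patch compose. It is not part of the diff: the path names ("uasts.txt", "vocab.txt", "prox/", "cooccurrences.bin") are hypothetical placeholders, and the word embeddings themselves are assumed to be trained afterwards with the external GloVe tool before the `mlp` role classifier is fit. The same pipeline is exposed on the command line as the `vocab`, `node2vec`, `glove`, `stats` and `mlp` subcommands of `python -m role2vec`.

# Sketch only: placeholder paths, assuming the API exactly as added in this patch.
from role2vec.glove import GloVe
from role2vec.node2vec import Node2Vec
from role2vec.random_walk import Graph
from role2vec.utils import read_paths, save_vocab
from role2vec.vocab import Vocab

uast_list = "uasts.txt"  # one path to a stored UAST model (*.asdf) per line

# 1. Collect the role/token vocabulary from the UASTs and store it on disk.
vocab = Vocab(log_level="INFO", num_processes=2)
save_vocab("vocab.txt", vocab.create(read_paths(uast_list)))

# 2. Run node2vec random walks and write one proximity matrix per UAST file.
graph = Graph(log_level="INFO", num_walks=1, walk_length=80, p=1.0, q=1.0)
node2vec = Node2Vec(log_level="INFO", num_processes=2, vocab_path="vocab.txt",
                    window=5, graph=graph)
node2vec.process(uast_list, "prox/")

# 3. Combine the per-file matrices into a single binary file in GloVe input format.
glove = GloVe(log_level="INFO", num_processes=2, vocab_path="vocab.txt")
glove.convert("prox/", "cooccurrences.bin", "**/*.asdf")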