6 changes: 0 additions & 6 deletions .gitignore
@@ -18,9 +18,3 @@
# jupyter notebooks
*.ipynb_checkpoints/

# virtualenv
role2vec/*

# python libraries
ast2vec/*
vecino/*
42 changes: 42 additions & 0 deletions .travis.yml
@@ -0,0 +1,42 @@
language: python
sudo: false
dist: trusty
services:
  - docker
cache:
  directories:
    - "$HOME/.cache/pip"
addons:
  apt:
    packages:
      - libboost-all-dev
      - libxml2-dev
_install: &_install
  - gimme 1.8
  - source ~/.gimme/envs/latest.env
  - pip install --upgrade pip
  - pip install -r requirements.txt codecov
  - pip install -e .
_coverage: &_coverage
  - SCRIPT="coverage run --concurrency=multiprocessing -m unittest discover && coverage combine"
matrix:
  include:
    - python: 3.4
      env: *_coverage
      install: *_install
    - python: 3.5
      env: *_coverage
      install: *_install
    - python: 3.6
      env: SCRIPT="pep8 --max-line-length=99 ."
      install: pip install pep8
    - python: 3.6
      env: *_coverage
      install: *_install
after_success:
  - codecov
fast_finish: true
script:
  - (eval "$SCRIPT")
notifications:
  email: false
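The `_install: &_install` and `_coverage: &_coverage` entries above are YAML anchors; each `*_install` / `*_coverage` alias expands to the anchored list, so the four matrix entries share a single definition of the install and coverage steps. A minimal sketch of how such an alias expands when parsed (assuming PyYAML is available; not part of this change):

```python
import yaml

doc = yaml.safe_load("""
_install: &_install
  - pip install -r requirements.txt
matrix:
  include:
    - python: "3.5"
      install: *_install
""")
# The alias was replaced by the anchored list during parsing.
assert doc["matrix"]["include"][0]["install"] == ["pip install -r requirements.txt"]
```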
Empty file added README.md
Empty file.
2 changes: 2 additions & 0 deletions requirements.txt
@@ -0,0 +1,2 @@
ast2vec[tf]>=0.3.4-alpha
scikit-learn>=0.19.0
Empty file added role2vec/__init__.py
Empty file.
114 changes: 114 additions & 0 deletions role2vec/__main__.py
@@ -0,0 +1,114 @@
import argparse
import logging
import sys

from ast2vec.__main__ import ArgumentDefaultsHelpFormatterNoNone, one_arg_parser
from modelforge.logs import setup_logging
from role2vec.glove import glove_entry
from role2vec.node2vec import node2vec_entry
from role2vec.stats import stats_entry
from role2vec.vocab import vocab_entry
from role2vec.roles.base import ROLES_MODELS, roles_entry


def get_parser() -> argparse.ArgumentParser:
    """
    Create the main parser.

    :return: Parser
    """
    parser = argparse.ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatterNoNone)
    parser.add_argument("--log-level", default="INFO", choices=logging._nameToLevel,
                        help="Logging verbosity.")

    # Create all common arguments

    process_arg = one_arg_parser("--processes", type=int, default=2, help="Number of processes.")
    vocab_arg = one_arg_parser("--vocabulary", default="vocab.txt", help="File with vocabulary.")
    uast_input_arg = one_arg_parser("input", help="Input file with UASTs.")

    # Construct subparsers

    subparsers = parser.add_subparsers(help="Commands", dest="command")

    glove_parser = subparsers.add_parser(
        "glove", help="Convert proximity matrices into GloVe-suitable format. Refer to "
                      "https://github.com/stanfordnlp/GloVe",
        formatter_class=ArgumentDefaultsHelpFormatterNoNone,
        parents=[process_arg, vocab_arg])
    glove_parser.set_defaults(handler=glove_entry)
    glove_parser.add_argument("input", help="Input directory with proximity matrices.")
    glove_parser.add_argument("output", help="Path to store the combined proximity matrix.")
    glove_parser.add_argument("--filter", default="**/*.asdf", help="File name glob selector.")

    node2vec_parser = subparsers.add_parser(
        "node2vec", help="Node2Vec random walk algorithm for assembling proximity matrices from "
                         "UASTs. Refer to https://github.com/aditya-grover/node2vec",
        formatter_class=ArgumentDefaultsHelpFormatterNoNone,
        parents=[process_arg, vocab_arg, uast_input_arg])
    node2vec_parser.set_defaults(handler=node2vec_entry)
    node2vec_parser.add_argument("output", help="Path to store the resulting matrices.")
    node2vec_parser.add_argument(
        "-n", "--num-walks", type=int, default=1, help="Number of random walks from each node.")
    node2vec_parser.add_argument(
        "-l", "--walk-length", type=int, default=80, help="Length of each random walk.")
    node2vec_parser.add_argument(
        "-w", "--window", type=int, default=5, help="Window size for node context.")
    node2vec_parser.add_argument(
        "-p", type=float, default=1.0,
        help="Controls the likelihood of immediately revisiting the previous node.")
    node2vec_parser.add_argument(
        "-q", type=float, default=1.0, help="Controls the likelihood of exploring outward nodes.")

    roles_parser = subparsers.add_parser(
        "mlp", help="Train/test the roles prediction model.",
        formatter_class=ArgumentDefaultsHelpFormatterNoNone,
        parents=[process_arg])
    roles_parser.set_defaults(handler=roles_entry)
    roles_parser.add_argument(
        "algorithm", choices=ROLES_MODELS.keys(), help="Specify the training algorithm.")
    roles_parser.add_argument("--train", help="Input file with UASTs for training.")
    roles_parser.add_argument("--test", help="Input file with UASTs for testing.")
    roles_parser.add_argument("--model", required=True, help="Path to store the trained model.")
    roles_parser.add_argument(
        "--embeddings", required=True, help="File with roles and tokens embeddings.")

    stats_parser = subparsers.add_parser(
        "stats", help="Collect statistics on the number of nodes w.r.t. the number of node roles "
                      "in UASTs.", formatter_class=ArgumentDefaultsHelpFormatterNoNone,
        parents=[process_arg, uast_input_arg])
    stats_parser.set_defaults(handler=stats_entry)
    stats_parser.add_argument("--stat", required=True, help="Path to store resulting statistics.")
    stats_parser.add_argument("--susp", required=True, help="Path to store suspicious UASTs.")

    vocab_parser = subparsers.add_parser(
        "vocab", help="Collect vocabulary from UASTs.",
        formatter_class=ArgumentDefaultsHelpFormatterNoNone,
        parents=[process_arg, uast_input_arg])
    vocab_parser.set_defaults(handler=vocab_entry)
    vocab_parser.add_argument("output", default="vocab.txt", help="Path to store vocabulary.")

    return parser


def main():
    """
    Create all the argparsers and invoke the function from set_defaults().

    :return: The result of the function from set_defaults().
    """
    parser = get_parser()
    args = parser.parse_args()
    args.log_level = logging._nameToLevel[args.log_level]
    setup_logging(args.log_level)
    try:
        handler = args.handler
    except AttributeError:
        def print_usage(_):
            parser.print_usage()

        handler = print_usage
    return handler(args)


if __name__ == "__main__":
    sys.exit(main())
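Each sub-command above registers its entry function via `set_defaults(handler=...)`, which is what lets `main()` dispatch on `args.handler`. A minimal, self-contained sketch of the same pattern (the `hello` command and `hello_entry` are illustrative, not part of the repo):

```python
import argparse

def hello_entry(args):
    print("hello,", args.name)

parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(dest="command")
hello_parser = subparsers.add_parser("hello")
hello_parser.add_argument("name")
hello_parser.set_defaults(handler=hello_entry)

args = parser.parse_args(["hello", "world"])
args.handler(args)  # dispatches to hello_entry, just like main() above
```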
91 changes: 91 additions & 0 deletions role2vec/glove.py
@@ -0,0 +1,91 @@
from collections import Counter
from pathlib import Path
import struct
from typing import Dict, List, Tuple

from ast2vec.coocc import Cooccurrences
from role2vec.map_reduce import MapReduce
from role2vec.utils import read_vocab


class GloVe(MapReduce):
    """
    Converts proximity matrices into GloVe-suitable format.
    Refer to https://github.com/stanfordnlp/GloVe
    """

    def __init__(self, log_level: str, num_processes: int, vocab_path: str):
        """
        :param log_level: Log level of GloVe.
        :param num_processes: Number of running processes. There's always one additional process
                              for reducing data.
        :param vocab_path: Path to the stored vocabulary.
        """
        super(GloVe, self).__init__(log_level=log_level, num_processes=num_processes)
        self.vocab = {word: i for i, word in enumerate(read_vocab(vocab_path))}

    def convert(self, src_dir: str, output: str, file_filter: str) -> None:
        """
        Combine all proximity matrices and save them in GloVe-suitable format.

        :param src_dir: Path to the stored proximity matrices.
        :param output: Path to store the resulting GloVe-suitable matrix.
        :param file_filter: Pattern for recursively scanning `src_dir`.
        """
        self._log.info("Scanning %s", src_dir)
        files = [str(p) for p in Path(src_dir).glob(file_filter)]
        self._log.info("Found %d files", len(files))
        if not files:
            return

        self._log.info("Combining proximity matrices.")
        mat = self.combine_mats(files)
        self._log.info("Finished combining.")

        self._log.info("Saving matrix.")
        self.save_mat(mat, output)

    def combine_mats(self, files: List[str]) -> Dict[Tuple[int, int], int]:
        """
        Combine proximity matrices.

        :param files: List of file paths to stored proximity matrices.
        :return: Mapping from pairs of vocabulary indices to their proximity combined over all
                 matrices.
        """
        counter = Counter()

        @MapReduce.wrap_queue_in
        def process_prox(self, filename):
            prox = Cooccurrences().load(filename)
            return {(prox.tokens[i], prox.tokens[j]): val for
                    i, j, val in zip(prox.matrix.row, prox.matrix.col, prox.matrix.data)}

        @MapReduce.wrap_queue_out()
        def combine_prox(self, result):
            # wrap_queue_out() invokes this as func(self, result), so `self` must be accepted
            # explicitly here.
            nonlocal counter
            counter.update(
                {(self.vocab[i], self.vocab[j]): val for (i, j), val in result.items()
                 if i in self.vocab and j in self.vocab})

        self.parallelize(files, process_prox, combine_prox)
        return counter

    @staticmethod
    def save_mat(mat: Dict[Tuple[int, int], int], output: str) -> None:
        """
        Save the matrix in GloVe-suitable format.

        :param mat: Counter storing proximities.
        :param output: Path to store the resulting GloVe-suitable matrix.
        """
        with open(output, "wb") as fout:
            for (i, j), val in mat.items():
                # Each record is two int32 indices followed by a float64 proximity value.
                fout.write(struct.pack("iid", i, j, float(val)))

    def _get_log_name(self):
        return "GloVe"


def glove_entry(args):
    glove = GloVe(args.log_level, args.processes, args.vocabulary)
    glove.convert(args.input, args.output, args.filter)
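`save_mat()` writes the output as a flat sequence of packed `(int32, int32, float64)` records. A minimal sketch of reading such a file back (not part of this change; the file name is illustrative):

```python
import struct

RECORD = struct.Struct("iid")  # two int32 indices and a float64 proximity

def iter_records(path):
    """Yield (i, j, proximity) triples from a file written by GloVe.save_mat()."""
    with open(path, "rb") as fin:
        while True:
            chunk = fin.read(RECORD.size)
            if not chunk:
                break
            yield RECORD.unpack(chunk)

# for i, j, val in iter_records("proximity.bin"):
#     ...
```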
111 changes: 111 additions & 0 deletions role2vec/map_reduce.py
@@ -0,0 +1,111 @@
import multiprocessing
import time
from typing import List

from ast2vec.pickleable_logger import PickleableLogger


class MapReduce(PickleableLogger):
    """
    Base class for parallel data processing. Creates a pool of workers for mapping the data and
    reduces the results in the main process.
    """

    def __init__(self, log_level: str, num_processes: int, queue_lim: int = 100):
        """
        :param log_level: Log level of MapReduce.
        :param num_processes: Number of running processes. There's always one additional process
                              for reducing data.
        :param queue_lim: Maximum number of results in the queue for reducing.
        """
        super(MapReduce, self).__init__(log_level=log_level)
        self.num_processes = num_processes
        self.queue_lim = queue_lim

    def parallelize(self, tasks: List[str], process_queue_in, process_queue_out) -> int:
        """
        Process tasks in parallel.

        :param tasks: List of filenames.
        :param process_queue_in: Function for processing items from the task queue.
        :param process_queue_out: Function for processing items from the result queue.
        :return: Number of successfully processed tasks.
        """
        queue_in = multiprocessing.Manager().Queue()
        queue_out = multiprocessing.Manager().Queue(self.queue_lim)
        processes = [multiprocessing.Process(target=process_queue_in,
                                             args=(self, queue_in, queue_out))
                     for _ in range(self.num_processes)]
        n_tasks = len(tasks)
        start_time = time.time()

        self._log.info("Starting tasks.")
        for p in processes:
            p.start()
        for t in tasks:
            queue_in.put(t)
        for _ in processes:
            # One sentinel per worker signals that the task queue is exhausted.
            queue_in.put(None)

        failures = process_queue_out(self, n_tasks, queue_out)
        for p in processes:
            p.join()

        self._log.info("Finished %d/%d tasks in %.2f seconds" %
                       (n_tasks - failures, n_tasks, time.time() - start_time))
        return n_tasks - failures

    @staticmethod
    def wrap_queue_in(func):
        """
        Wrapper for automatically querying tasks and storing results in the result queue.

        :param func: Function that can process a single task and accepts `self` as a parameter.
        """
        def wrapper(self, queue_in, queue_out):
            while True:
                item = queue_in.get()
                if item is None:
                    break
                try:
                    queue_out.put(func(self, item))
                except Exception:
                    self._log.exception("%s failed", item)
                    queue_out.put(None)
        return wrapper

    @staticmethod
    def wrap_queue_out(freq: int = 1000):
        """
        Wrapper factory that allows parametrization.

        :param freq: Log progress every `freq` iterations.
        """
        def outer_wrapper(func):
            """
            Wrapper for automatically querying results and reducing them.

            :param func: Function that can process a result and accepts `self` as a parameter.
            """
            def wrapper(self, n_tasks, queue_out):
                failures = 0
                start = time.time()

                for i in range(n_tasks):
                    result = queue_out.get()
                    if (i + 1) % freq == 0:
                        self._log.info("Processed %d/%d in %.2f seconds" %
                                       (i + 1, n_tasks, time.time() - start))
                    if result is None:
                        failures += 1
                        continue
                    func(self, result)

                self._log.info("Finished %d/%d in %.2f seconds" %
                               (n_tasks, n_tasks, time.time() - start))
                return failures
            return wrapper
        return outer_wrapper

    def _get_log_name(self):
        return "MapReduce"