6 changes: 0 additions & 6 deletions .gitignore
@@ -18,9 +18,3 @@
# jupyter notebooks
*.ipynb_checkpoints/

# virtualenv
role2vec/*

# python libraries
ast2vec/*
vecino/*
42 changes: 42 additions & 0 deletions .travis.yml
@@ -0,0 +1,42 @@
language: python
sudo: false
dist: trusty
services:
  - docker
cache:
  directories:
    - "$HOME/.cache/pip"
addons:
  apt:
    packages:
      - libboost-all-dev
      - libxml2-dev
_install: &_install
  - gimme 1.8
  - source ~/.gimme/envs/latest.env
  - pip install --upgrade pip
  - pip install -r requirements.txt codecov
  - pip install -e .
_coverage: &_coverage
  - SCRIPT="coverage run --concurrency=multiprocessing -m unittest discover && coverage combine"
matrix:
  include:
    - python: 3.4
      env: *_coverage
      install: *_install
    - python: 3.5
      env: *_coverage
      install: *_install
    - python: 3.6
      env: SCRIPT="pep8 --max-line-length=99 ."
      install: pip install pep8
    - python: 3.6
      env: *_coverage
      install: *_install
after_success:
  - codecov
fast_finish: true
script:
  - (eval "$SCRIPT")
notifications:
  email: false
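The `_install: &_install` and `_coverage: &_coverage` entries above are YAML anchors; each `*_install` / `*_coverage` alias expands to the anchored list, so the four matrix entries share a single definition of the install and coverage steps. A minimal sketch of how such an alias expands when parsed (assuming PyYAML is available; not part of this change):

```python
import yaml

doc = yaml.safe_load("""
_install: &_install
  - pip install -r requirements.txt
matrix:
  include:
    - python: "3.5"
      install: *_install
""")
# The alias was replaced by the anchored list during parsing.
assert doc["matrix"]["include"][0]["install"] == ["pip install -r requirements.txt"]
```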
Empty file added README.md
Empty file.
2 changes: 2 additions & 0 deletions requirements.txt
@@ -0,0 +1,2 @@
ast2vec[tf]>=0.3.4-alpha
scikit-learn>=0.19.0
Empty file added role2vec/__init__.py
Empty file.
114 changes: 114 additions & 0 deletions role2vec/__main__.py
@@ -0,0 +1,114 @@
import argparse
import logging
import sys

from ast2vec.__main__ import ArgumentDefaultsHelpFormatterNoNone, one_arg_parser
from modelforge.logs import setup_logging
from role2vec.glove import glove_entry
from role2vec.node2vec import node2vec_entry
from role2vec.stats import stats_entry
from role2vec.vocab import vocab_entry
from role2vec.roles.base import ROLES_MODELS, roles_entry


def get_parser() -> argparse.ArgumentParser:
    """
    Create the main parser.

    :return: Parser
    """
    parser = argparse.ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatterNoNone)
    parser.add_argument("--log-level", default="INFO", choices=logging._nameToLevel,
                        help="Logging verbosity.")

    # Create all common arguments

    process_arg = one_arg_parser("--processes", type=int, default=2, help="Number of processes.")
    vocab_arg = one_arg_parser("--vocabulary", default="vocab.txt", help="File with vocabulary.")
    uast_input_arg = one_arg_parser("input", help="Input file with UASTs.")

    # Construct subparsers

    subparsers = parser.add_subparsers(help="Commands", dest="command")

    glove_parser = subparsers.add_parser(
        "glove", help="Convert proximity matrices into GloVe-suitable format. Refer to "
                      "https://github.com/stanfordnlp/GloVe",
        formatter_class=ArgumentDefaultsHelpFormatterNoNone,
        parents=[process_arg, vocab_arg])
    glove_parser.set_defaults(handler=glove_entry)
    glove_parser.add_argument("input", help="Input directory with proximity matrices.")
    glove_parser.add_argument("output", help="Path to store the combined proximity matrix.")
    glove_parser.add_argument("--filter", default="**/*.asdf", help="File name glob selector.")

    node2vec_parser = subparsers.add_parser(
        "node2vec", help="Node2Vec random walk algorithm for assembling proximity matrices from "
                         "UASTs. Refer to https://github.com/aditya-grover/node2vec",
        formatter_class=ArgumentDefaultsHelpFormatterNoNone,
        parents=[process_arg, vocab_arg, uast_input_arg])
    node2vec_parser.set_defaults(handler=node2vec_entry)
    node2vec_parser.add_argument("output", help="Path to store the resulting matrices.")
    node2vec_parser.add_argument(
        "-n", "--num-walks", type=int, default=1, help="Number of random walks from each node.")
    node2vec_parser.add_argument(
        "-l", "--walk-length", type=int, default=80, help="Length of each random walk.")
    node2vec_parser.add_argument(
        "-w", "--window", type=int, default=5, help="Window size for node context.")
    node2vec_parser.add_argument(
        "-p", type=float, default=1.0,
        help="Controls the likelihood of immediately revisiting the previous node.")
    node2vec_parser.add_argument(
        "-q", type=float, default=1.0, help="Controls the likelihood of exploring outward nodes.")

    roles_parser = subparsers.add_parser(
        "mlp", help="Train/test the roles prediction model.",
        formatter_class=ArgumentDefaultsHelpFormatterNoNone,
        parents=[process_arg])
    roles_parser.set_defaults(handler=roles_entry)
    roles_parser.add_argument(
        "algorithm", choices=ROLES_MODELS.keys(), help="Specify the training algorithm.")
    roles_parser.add_argument("--train", help="Input file with UASTs for training.")
    roles_parser.add_argument("--test", help="Input file with UASTs for testing.")
    roles_parser.add_argument("--model", required=True, help="Path to store the trained model.")
    roles_parser.add_argument(
        "--embeddings", required=True, help="File with roles and tokens embeddings.")

    stats_parser = subparsers.add_parser(
        "stats", help="Collect statistics on the number of nodes w.r.t. the number of node roles "
                      "in UASTs.", formatter_class=ArgumentDefaultsHelpFormatterNoNone,
        parents=[process_arg, uast_input_arg])
    stats_parser.set_defaults(handler=stats_entry)
    stats_parser.add_argument("--stat", required=True, help="Path to store resulting statistics.")
    stats_parser.add_argument("--susp", required=True, help="Path to store suspicious UASTs.")

    vocab_parser = subparsers.add_parser(
        "vocab", help="Collect vocabulary from UASTs.",
        formatter_class=ArgumentDefaultsHelpFormatterNoNone,
        parents=[process_arg, uast_input_arg])
    vocab_parser.set_defaults(handler=vocab_entry)
    vocab_parser.add_argument("output", default="vocab.txt", help="Path to store vocabulary.")

    return parser


def main():
    """
    Create all the argparsers and invoke the function from set_defaults().

    :return: The result of the function from set_defaults().
    """
    parser = get_parser()
    args = parser.parse_args()
    args.log_level = logging._nameToLevel[args.log_level]
    setup_logging(args.log_level)
    try:
        handler = args.handler
    except AttributeError:
        def print_usage(_):
            parser.print_usage()

        handler = print_usage
    return handler(args)


if __name__ == "__main__":
    sys.exit(main())
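Each sub-command above registers its entry function via `set_defaults(handler=...)`, which is what lets `main()` dispatch on `args.handler`. A minimal, self-contained sketch of the same pattern (the `hello` command and `hello_entry` are illustrative, not part of the repo):

```python
import argparse

def hello_entry(args):
    print("hello,", args.name)

parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(dest="command")
hello_parser = subparsers.add_parser("hello")
hello_parser.add_argument("name")
hello_parser.set_defaults(handler=hello_entry)

args = parser.parse_args(["hello", "world"])
args.handler(args)  # dispatches to hello_entry, just like main() above
```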
91 changes: 91 additions & 0 deletions role2vec/glove.py
@@ -0,0 +1,91 @@
from collections import Counter
from pathlib import Path
import struct
from typing import Dict, List, Tuple

from ast2vec.coocc import Cooccurrences
from role2vec.map_reduce import MapReduce
from role2vec.utils import read_vocab


class GloVe(MapReduce):
    """
    Converts proximity matrices into GloVe-suitable format.
    Refer to https://github.com/stanfordnlp/GloVe
    """

    def __init__(self, log_level: str, num_processes: int, vocab_path: str):
        """
        :param log_level: Log level of GloVe.
        :param num_processes: Number of running processes. There's always one additional process
                              for reducing data.
        :param vocab_path: Path to the stored vocabulary.
        """
        super(GloVe, self).__init__(log_level=log_level, num_processes=num_processes)
        self.vocab = {word: i for i, word in enumerate(read_vocab(vocab_path))}

    def convert(self, src_dir: str, output: str, file_filter: str) -> None:
        """
        Combine all proximity matrices and save them in GloVe-suitable format.

        :param src_dir: Path to the stored proximity matrices.
        :param output: Path to store the resulting GloVe-suitable matrix.
        :param file_filter: Pattern for recursively scanning `src_dir`.
        """
        self._log.info("Scanning %s", src_dir)
        files = [str(p) for p in Path(src_dir).glob(file_filter)]
        self._log.info("Found %d files", len(files))
        if not files:
            return

        self._log.info("Combining proximity matrices.")
        mat = self.combine_mats(files)
        self._log.info("Finished combining.")

        self._log.info("Saving matrix.")
        self.save_mat(mat, output)

    def combine_mats(self, files: List[str]) -> Dict[Tuple[int, int], int]:
        """
        Combine proximity matrices.

        :param files: List of file paths to stored proximity matrices.
        :return: Mapping from pairs of vocabulary indices to their proximity combined over all
                 matrices.
        """
        counter = Counter()

        @MapReduce.wrap_queue_in
        def process_prox(self, filename):
            prox = Cooccurrences().load(filename)
            return {(prox.tokens[i], prox.tokens[j]): val for
                    i, j, val in zip(prox.matrix.row, prox.matrix.col, prox.matrix.data)}

        @MapReduce.wrap_queue_out()
        def combine_prox(self, result):
            # wrap_queue_out() invokes this as func(self, result), so `self` must be accepted
            # explicitly here.
            nonlocal counter
            counter.update(
                {(self.vocab[i], self.vocab[j]): val for (i, j), val in result.items()
                 if i in self.vocab and j in self.vocab})

        self.parallelize(files, process_prox, combine_prox)
        return counter

    @staticmethod
    def save_mat(mat: Dict[Tuple[int, int], int], output: str) -> None:
        """
        Save the matrix in GloVe-suitable format.

        :param mat: Counter storing proximities.
        :param output: Path to store the resulting GloVe-suitable matrix.
        """
        with open(output, "wb") as fout:
            for (i, j), val in mat.items():
                # Each record is two int32 indices followed by a float64 proximity value.
                fout.write(struct.pack("iid", i, j, float(val)))

    def _get_log_name(self):
        return "GloVe"


def glove_entry(args):
    glove = GloVe(args.log_level, args.processes, args.vocabulary)
    glove.convert(args.input, args.output, args.filter)
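`save_mat()` writes the output as a flat sequence of packed `(int32, int32, float64)` records. A minimal sketch of reading such a file back (not part of this change; the file name is illustrative):

```python
import struct

RECORD = struct.Struct("iid")  # two int32 indices and a float64 proximity

def iter_records(path):
    """Yield (i, j, proximity) triples from a file written by GloVe.save_mat()."""
    with open(path, "rb") as fin:
        while True:
            chunk = fin.read(RECORD.size)
            if not chunk:
                break
            yield RECORD.unpack(chunk)

# for i, j, val in iter_records("proximity.bin"):
#     ...
```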
111 changes: 111 additions & 0 deletions role2vec/map_reduce.py
@@ -0,0 +1,111 @@
import multiprocessing
import time
from typing import List

from ast2vec.pickleable_logger import PickleableLogger


class MapReduce(PickleableLogger):
    """
    Base class for parallel data processing. Creates a pool of workers for mapping the data and
    reduces the results in the main process.
    """

    def __init__(self, log_level: str, num_processes: int, queue_lim: int = 100):
        """
        :param log_level: Log level of MapReduce.
        :param num_processes: Number of running processes. There's always one additional process
                              for reducing data.
        :param queue_lim: Maximum number of results in the queue for reducing.
        """
        super(MapReduce, self).__init__(log_level=log_level)
        self.num_processes = num_processes
        self.queue_lim = queue_lim

    def parallelize(self, tasks: List[str], process_queue_in, process_queue_out) -> int:
        """
        Process tasks in parallel.

        :param tasks: List of filenames.
        :param process_queue_in: Function for processing items from the task queue.
        :param process_queue_out: Function for processing items from the result queue.
        :return: Number of successfully processed tasks.
        """
        queue_in = multiprocessing.Manager().Queue()
        queue_out = multiprocessing.Manager().Queue(self.queue_lim)
        processes = [multiprocessing.Process(target=process_queue_in,
                                             args=(self, queue_in, queue_out))
                     for _ in range(self.num_processes)]
        n_tasks = len(tasks)
        start_time = time.time()

        self._log.info("Starting tasks.")
        for p in processes:
            p.start()
        for t in tasks:
            queue_in.put(t)
        for _ in processes:
            # One sentinel per worker signals that the task queue is exhausted.
            queue_in.put(None)

        failures = process_queue_out(self, n_tasks, queue_out)
        for p in processes:
            p.join()

        self._log.info("Finished %d/%d tasks in %.2f seconds" %
                       (n_tasks - failures, n_tasks, time.time() - start_time))
        return n_tasks - failures

    @staticmethod
    def wrap_queue_in(func):
        """
        Wrapper for automatically querying tasks and storing results in the result queue.

        :param func: Function that can process a single task and accepts `self` as a parameter.
        """
        def wrapper(self, queue_in, queue_out):
            while True:
                item = queue_in.get()
                if item is None:
                    break
                try:
                    queue_out.put(func(self, item))
                except Exception:
                    self._log.exception("%s failed", item)
                    queue_out.put(None)
        return wrapper

    @staticmethod
    def wrap_queue_out(freq: int = 1000):
        """
        Wrapper factory that allows parametrization.

        :param freq: Log progress every `freq` iterations.
        """
        def outer_wrapper(func):
            """
            Wrapper for automatically querying results and reducing them.

            :param func: Function that can process a result and accepts `self` as a parameter.
            """
            def wrapper(self, n_tasks, queue_out):
                failures = 0
                start = time.time()

                for i in range(n_tasks):
                    result = queue_out.get()
                    if (i + 1) % freq == 0:
                        self._log.info("Processed %d/%d in %.2f seconds" %
                                       (i + 1, n_tasks, time.time() - start))
                    if result is None:
                        failures += 1
                        continue
                    func(self, result)

                self._log.info("Finished %d/%d in %.2f seconds" %
                               (n_tasks, n_tasks, time.time() - start))
                return failures
            return wrapper
        return outer_wrapper

    def _get_log_name(self):
        return "MapReduce"