diff --git a/.gitignore b/.gitignore index 5c8fe09..491c84c 100644 --- a/.gitignore +++ b/.gitignore @@ -18,9 +18,3 @@ # jupyter notebooks *.ipynb_checkpoints/ -# virtualenv -role2vec/* - -# python libraries -ast2vec/* -vecino/* diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..eca0afa --- /dev/null +++ b/.travis.yml @@ -0,0 +1,42 @@ +language: python +sudo: false +dist: trusty +services: +- docker +cache: + directories: + - "$HOME/.cache/pip" +addons: + apt: + packages: + - libboost-all-dev + - libxml2-dev +_install: &_install + - gimme 1.8 + - source ~/.gimme/envs/latest.env + - pip install --upgrade pip + - pip install -r requirements.txt codecov + - pip install -e . +_coverage: &_coverage + - SCRIPT="coverage run --concurrency=multiprocessing -m unittest discover && coverage combine" +matrix: + include: + - python: 3.4 + env: *_coverage + install: *_install + - python: 3.5 + env: *_coverage + install: *_install + - python: 3.6 + env: SCRIPT="pep8 --max-line-length=99 ." + install: pip install pep8 + - python: 3.6 + env: *_coverage + install: *_install + after_success: + - codecov + fast_finish: true +script: +- (eval "$SCRIPT") +notifications: + email: false diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1d6a773 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +ast2vec[tf]>=0.3.4-alpha +scikit-learn>=0.19.0 \ No newline at end of file diff --git a/role2vec/__init__.py b/role2vec/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/role2vec/__main__.py b/role2vec/__main__.py new file mode 100644 index 0000000..fedcf49 --- /dev/null +++ b/role2vec/__main__.py @@ -0,0 +1,114 @@ +import argparse +import logging +import sys + +from ast2vec.__main__ import ArgumentDefaultsHelpFormatterNoNone, one_arg_parser +from modelforge.logs import setup_logging +from role2vec.glove import glove_entry +from role2vec.node2vec import node2vec_entry +from role2vec.stats import stats_entry +from role2vec.vocab import vocab_entry +from role2vec.roles.base import ROLES_MODELS, roles_entry + + +def get_parser() -> argparse.ArgumentParser: + """ + Create main parser. + + :return: Parser + """ + parser = argparse.ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatterNoNone) + parser.add_argument("--log-level", default="INFO", choices=logging._nameToLevel, + help="Logging verbosity.") + + # Create all common arguments + + process_arg = one_arg_parser("--processes", type=int, default=2, help="Number of processes.") + vocab_arg = one_arg_parser("--vocabulary", default="vocab.txt", help="File with vocabulary.") + uast_input_arg = one_arg_parser("input", help="Input file with UASTs.") + + # Construct subparsers + + subparsers = parser.add_subparsers(help="Commands", dest="command") + + glove_parser = subparsers.add_parser( + "glove", help="Convert proximity matrices into GloVe suitable format. 
Refer to " + "https://github.com/stanfordnlp/GloVe", + formatter_class=ArgumentDefaultsHelpFormatterNoNone, + parents=[process_arg, vocab_arg]) + glove_parser.set_defaults(handler=glove_entry) + glove_parser.add_argument("input", help="Input directory with proximity matrices.") + glove_parser.add_argument("output", help="Path to store combined proximity matrix.") + glove_parser.add_argument("--filter", default="**/*.asdf", help="File name glob selector.") + + node2vec_parser = subparsers.add_parser( + "node2vec", help="Node2Vec random walk algorithm for assembling proximity matrices from " + "UASTs. Refer to https://github.com/aditya-grover/node2vec", + formatter_class=ArgumentDefaultsHelpFormatterNoNone, + parents=[process_arg, vocab_arg, uast_input_arg]) + node2vec_parser.set_defaults(handler=node2vec_entry) + node2vec_parser.add_argument("output", help="Path to store the resulting matrices.") + node2vec_parser.add_argument( + "-n", "--num-walks", type=int, default=1, help="Number of random walks from each node.") + node2vec_parser.add_argument( + "-l", "--walk-length", type=int, default=80, help="Length of each random walk.") + node2vec_parser.add_argument( + "-w", "--window", type=int, default=5, help="Window size for node context.") + node2vec_parser.add_argument( + "-p", type=float, default=1.0, + help="Controls the likelihood of immediately revisiting previous node.") + node2vec_parser.add_argument( + "-q", type=float, default=1.0, help="Controls the likelihood of exploring outward nodes.") + + roles_parser = subparsers.add_parser( + "mlp", help="Train/test roles prediction model.", + formatter_class=ArgumentDefaultsHelpFormatterNoNone, + parents=[process_arg]) + roles_parser.set_defaults(handler=roles_entry) + roles_parser.add_argument( + "algorithm", choices=ROLES_MODELS.keys(), help="Specify training algorithm.") + roles_parser.add_argument("--train", help="Input file with UASTs for training.") + roles_parser.add_argument("--test", help="Input file with UASTs for testing.") + roles_parser.add_argument("--model", required=True, help="Path to store trained model.") + roles_parser.add_argument( + "--embeddings", required=True, help="File with roles and tokens embeddings.") + + stats_parser = subparsers.add_parser( + "stats", help="Collect statistics for number of nodes w.r.t. number of node roles in " + "UASTs.", formatter_class=ArgumentDefaultsHelpFormatterNoNone, + parents=[process_arg, uast_input_arg]) + stats_parser.set_defaults(handler=stats_entry) + stats_parser.add_argument("--stat", required=True, help="Path to store resulting statisics.") + stats_parser.add_argument("--susp", required=True, help="Path to store suspicious UASTs.") + + vocab_parser = subparsers.add_parser( + "vocab", help="Collect vocabulary from UASTs.", + formatter_class=ArgumentDefaultsHelpFormatterNoNone, + parents=[process_arg, uast_input_arg]) + vocab_parser.set_defaults(handler=vocab_entry) + vocab_parser.add_argument("output", default="vocab.txt", help="Path to store vocabulary.") + + return parser + + +def main(): + """ + Create all the argparsers and invoke the function from set_defaults(). + + :return: The result of the function from set_defaults(). 
+ """ + parser = get_parser() + args = parser.parse_args() + args.log_level = logging._nameToLevel[args.log_level] + setup_logging(args.log_level) + try: + handler = args.handler + except AttributeError: + def print_usage(_): + parser.print_usage() + + handler = print_usage + return handler(args) + +if __name__ == "__main__": + sys.exit(main()) diff --git a/role2vec/glove.py b/role2vec/glove.py new file mode 100644 index 0000000..beb01b5 --- /dev/null +++ b/role2vec/glove.py @@ -0,0 +1,91 @@ +from collections import Counter +from pathlib import Path +import struct +from typing import Dict, List, Tuple + +from ast2vec.coocc import Cooccurrences +from role2vec.map_reduce import MapReduce +from role2vec.utils import read_vocab + + +class GloVe(MapReduce): + """ + Converts proximity matrices into GloVe suitable format. + Refer to https://github.com/stanfordnlp/GloVe + """ + + def __init__(self, log_level: str, num_processes: int, vocab_path: str): + """ + :param log_level: Log level of GloVe. + :param num_processes: Number of running processes. There's always one additional process + for reducing data. + :param vocab_path: Path to stored vocabulary. + """ + super(GloVe, self).__init__(log_level=log_level, num_processes=num_processes) + self.vocab = {word: i for i, word in enumerate(read_vocab(vocab_path))} + + def convert(self, src_dir: str, output: str, file_filter: str) -> None: + """ + Combine all proximity matrices and save them into GloVe suitable format. + + :param src_dir: Path to stored proximity matrices. + :param output: Path for storing the resulting GloVe suitable matrix. + :param file_filter: Pattern for recursively scanning `src_dir`. + """ + self._log.info("Scanning %s", src_dir) + files = [str(p) for p in Path(src_dir).glob(file_filter)] + self._log.info("Found %d files", len(files)) + if not files: + return 0 + + self._log.info("Combine proximity matrices.") + mat = self.combine_mats(files) + self._log.info("Finished combining.") + + self._log.info("Saving matrix.") + self.save_mat(mat, output) + + def combine_mats(self, files: List[str]) -> Dict[Tuple[str, str], int]: + """ + Combine proximity matrices. + + :param files: List of filepaths to stored proximity matrices. + :return: Mapping from token pairs to their proximity combined over all matrices. + """ + counter = Counter() + + @MapReduce.wrap_queue_in + def process_prox(self, filename): + prox = Cooccurrences().load(filename) + return {(prox.tokens[i], prox.tokens[j]): val for + i, j, val in zip(prox.matrix.row, prox.matrix.col, prox.matrix.data)} + + @MapReduce.wrap_queue_out() + def combine_prox(result): + nonlocal counter + counter.update( + {(self.vocab[i], self.vocab[j]): val for (i, j), val in result.items() + if i in self.vocab and j in self.vocab}) + + self.parallelize(files, process_prox, combine_prox) + return counter + + @staticmethod + def save_mat(mat: Dict[Tuple[str, str], int], output: str) -> None: + """ + Save matrix in GloVe suitable format. + + :param mat: Counter storing proximities. + :param output: Path for storing the resulting GloVe suitable matrix. 
+ """ + with open(output, "wb") as fout: + for (i, j), val in mat.items(): + fout.write(struct.pack("iid", i, j, int(val))) + + def _get_log_name(self): + return "GloVe" + + +def glove_entry(args): + glove = GloVe(args.log_level, args.processes, args.vocabulary) + glove.convert(args.input, args.output, args.filter) diff --git a/role2vec/map_reduce.py b/role2vec/map_reduce.py new file mode 100644 index 0000000..eb9dced --- /dev/null +++ b/role2vec/map_reduce.py @@ -0,0 +1,111 @@ +import multiprocessing +import time +from typing import List + +from ast2vec.pickleable_logger import PickleableLogger + + +class MapReduce(PickleableLogger): + """ + Base class for parallel data processign. Creates a pool of workers for data mangling and + reduces data in the main process. + """ + + def __init__(self, log_level: str, num_processes: int, queue_lim: int=100): + """ + :param log_level: Log level of MapReduce. + :param num_processes: Number of running processes. There's always one additional process + for reducing data. + :param queue_lim: Maximum number of results in queue for reducing. + """ + super(MapReduce, self).__init__(log_level=log_level) + self.num_processes = num_processes + self.queue_lim = queue_lim + + def parallelize(self, tasks: List[str], process_queue_in, process_queue_out) -> int: + """ + Process tasks in parallel. + + :param tasks: List of filenames. + :param process_queue_in: Function for processing items from the task queue. + :param process_queue_out: Function for processing items from the result queue. + :return: Number of failed tasks. + """ + queue_in = multiprocessing.Manager().Queue() + queue_out = multiprocessing.Manager().Queue(self.queue_lim) + processes = [multiprocessing.Process(target=process_queue_in, + args=(self, queue_in, queue_out)) + for i in range(self.num_processes)] + n_tasks = len(tasks) + start_time = time.time() + + self._log.info("Starting tasks.") + for p in processes: + p.start() + for t in tasks: + queue_in.put(t) + for _ in processes: + queue_in.put(None) + + failures = process_queue_out(self, n_tasks, queue_out) + for p in processes: + p.join() + + self._log.info("Finished %d/%d tasks in %.2f" % + (n_tasks - failures, n_tasks, time.time() - start_time)) + return len(tasks) - failures + + @staticmethod + def wrap_queue_in(func): + """ + Wrapper for automatic quering of tasks and storing results in the result queue. + + :param func: Function that can process a single task and accepts `self` as parameter. + """ + def wrapper(self, queue_in, queue_out): + while True: + item = queue_in.get() + if item is None: + break + try: + queue_out.put(func(self, item)) + except: + self._log.exception("%s failed", item) + queue_out.put(None) + return wrapper + + @staticmethod + def wrap_queue_out(freq: int=1000): + """ + Wrapper for allowing parametrization. + + :param freq: Logs information every `freq` iterations. + """ + def outer_wrapper(func): + """ + Wrapper for automatic quering of results and reducing them. + + :param func: Function that can process a result and accepts `self` as parameter. 
+ """ + def wrapper(self, n_tasks, queue_out): + failures = 0 + start = time.time() + + for i in range(n_tasks): + result = queue_out.get() + if (i + 1) % freq == 0: + self._log.info("Processed %d/%d in %.2f" % + (i + 1, n_tasks, time.time() - start)) + if result is None: + failures += 1 + continue + func(self, result) + + self._log.info("Finished %d/%d in %.2f seconds" % + (i + 1, n_tasks, time.time() - start)) + return failures + return wrapper + return outer_wrapper + + def _get_log_name(self): + return "MapReduce" diff --git a/role2vec/node2vec.py b/role2vec/node2vec.py new file mode 100644 index 0000000..81071dc --- /dev/null +++ b/role2vec/node2vec.py @@ -0,0 +1,122 @@ +from collections import defaultdict +from itertools import product +import os +from typing import List + +import numpy +from scipy.sparse import coo_matrix, diags + +from ast2vec.coocc import Cooccurrences +from ast2vec.uast import UASTModel +from role2vec.map_reduce import MapReduce +from role2vec.random_walk import Graph +from role2vec.utils import read_paths, read_vocab + + +class Node2Vec(MapReduce): + """ + Uses Node2Vec random walk algorithm for assembling proximity matrices from UASTs. + Refer to https://github.com/aditya-grover/node2vec + """ + + MAX_VOCAB_WORDS = 1000000 + + def __init__(self, log_level: str, num_processes: int, vocab_path: str, window: int, + graph: Graph): + """ + :param log_level: Log level of Node2Vec. + :param num_processes: Number of running processes. There's always one additional process + for reducing data. + :param vocab_path: Path to stored vocabulary. + :param window: Context window size for collecting proximities. + :param graph: Graph object for random walks generation. + """ + super(Node2Vec, self).__init__(log_level=log_level, num_processes=num_processes) + self.graph = graph + self.vocab = {w: i for i, w in enumerate(read_vocab(vocab_path, Node2Vec.MAX_VOCAB_WORDS))} + self.window = window + + def process(self, fname: str, output_dir: str) -> None: + """ + Extract proximity matrices from UASTs. + + :param fname: Path to file with filepaths to stored UASTs. + :param output_dir: Path to directory for storing proximity matrices. + """ + self._log.info("Scanning %s", fname) + paths = read_paths(fname) + self._log.info("Found %d files", len(paths)) + + @MapReduce.wrap_queue_in + def process_uast(self, obj): + filename, output = obj + self._log.info("Processing %s", filename) + uast = UASTModel().load(filename) + dok_matrix = defaultdict(int) + + for walk in self.graph.simulate_walks(uast): + walk = [[self.vocab[t] for t in map(str, node.tokens) + if t in self.vocab] for node in walk] + # Connect each token to the next `self.window` tokens. 
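+                # Note: the slice below covers the next `self.window - 1` nodes of the walk
+                # (fewer near its end); counts are kept per ordered id pair here and the
+                # matrix is symmetrized after the loop via `mat + mat.T - diags(mat.diagonal())`.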
+ for i, cur_tokens in enumerate(walk[:-1]): + for next_tokens in walk[(i + 1):(i + self.window)]: + for word1, word2 in product(cur_tokens, next_tokens): + # Symmetry will be accounted for later + dok_matrix[(word1, word2)] += 1 + + del uast + + mat = coo_matrix( + (Node2Vec.MAX_VOCAB_WORDS, Node2Vec.MAX_VOCAB_WORDS), dtype=numpy.int32) + mat.row = row = numpy.empty(len(dok_matrix), dtype=numpy.int32) + mat.col = col = numpy.empty(len(dok_matrix), dtype=numpy.int32) + mat.data = data = numpy.empty(len(dok_matrix), dtype=numpy.int32) + for i, (coord, val) in enumerate(sorted(dok_matrix.items())): + row[i], col[i] = coord + data[i] = val + + del dok_matrix + # Accounting for symmetry + mat = coo_matrix(mat + mat.T - diags(mat.diagonal())) + + coocc = Cooccurrences() + coocc.construct(tokens=sorted(self.vocab, key=self.vocab.get), matrix=mat) + coocc.save(output) + self._log.info("Finished processing %s", filename) + return filename + + @MapReduce.wrap_queue_out() + def process_output(self, result): + pass + + self._log.info("Preprocessing file names.") + paths = self._preprocess_paths(paths, output_dir) + self.parallelize(paths, process_uast, process_output) + + def _get_log_name(self): + return "Node2Vec" + + def _preprocess_paths(self, paths: List[str], output_dir: str) -> List[str]: + """ + Prepare paths for storing proximity matrices. + + :param paths: List of filepaths to stored UASTs. + :param output_dir: Path to directory for storing proximity matrices. + :return: List of filepaths for storing proximity matrices. + """ + preprocessed_paths = [] + for p in paths: + name = os.path.basename(p) + if name.startswith("uast_"): + name = name[len("uast_"):] + out_dir = os.path.join(output_dir, name[0]) + os.makedirs(out_dir, exist_ok=True) + out_fname = os.path.join(out_dir, name) + preprocessed_paths.append((p, out_fname)) + return preprocessed_paths + + +def node2vec_entry(args): + graph = Graph(args.log_level, args.num_walks, args.walk_length, args.p, args.q) + node2vec = Node2Vec(args.log_level, args.processes, args.vocabulary, args.window, graph) + node2vec.process(args.input, args.output) diff --git a/role2vec/random_walk.py b/role2vec/random_walk.py new file mode 100644 index 0000000..3df5dbd --- /dev/null +++ b/role2vec/random_walk.py @@ -0,0 +1,187 @@ +from collections import namedtuple +import random +from typing import Dict, Iterator, List, Tuple + +import numpy as np + +from ast2vec.pickleable_logger import PickleableLogger +from ast2vec.token_parser import TokenParser +from role2vec.utils import node_iterator + +GraphNode = namedtuple("GraphNode", ["id", "neighbors", "tokens"]) + + +class Graph(PickleableLogger): + """ + Generates random walks from UASTs. + """ + + def __init__(self, log_level: str, num_walks: int, walk_length: int, p: float, q: float): + """ + :param log_level: Log level of Node2Vec. + :param num_walks: Number of random walks from each node. + :param walk_length: Random walk length. + :param p: Controls the likelihood of immediately revisiting previous node. + :param q: Controls the likelihood of exploring outward nodes. 
+ """ + if walk_length <= 1: + raise ValueError("Random walks have at least two nodes.") + + super(Graph, self).__init__(log_level=log_level) + self.num_walks = num_walks + self.walk_length = walk_length + self.p = 1 / p + self.q = 1 / q + self.token_parser = TokenParser() + + def node2vec_walk(self, start_node: GraphNode, edges: Dict[Tuple[int, int], None], + nodes: List[GraphNode]) -> List[GraphNode]: + """ + Simulate a random walk starting from start node. + + :param start_node: Starting node for random walk. + :param edges: Dict for storing mapping from node id pairs to transition probabilities. + :param nodes: List of UAST nodes. + :return: List of GraphNodes in random walk. + """ + walk = [None] * self.walk_length + prev_node = walk[0] = start_node + cur_node = walk[1] = nodes[random.choice(start_node.neighbors)] + + for i in range(2, self.walk_length): + J, q = edges[(prev_node.id, cur_node.id)] + kk = np.random.randint(len(J)) + + # Draw a sample from discrete distribution at constant time. + if np.random.rand() < q[kk]: + ind = kk + else: + ind = J[kk] + + prev_node = cur_node + cur_node = walk[i] = nodes[cur_node.neighbors[ind]] + + return walk + + def simulate_walks(self, uasts) -> Iterator[List[GraphNode]]: + """ + Repeatedly simulate random walks from each node. + + :param uasts: List of UASTs. + :return: Iterator over random walks generated for the input UASTs. + """ + for uast, filename in zip(uasts.uasts, uasts.filenames): + nodes, edges = self._preprocess_uast(uast) + n_nodes = len(nodes) + + if n_nodes == 1: + self._log.info("Skipping UAST for %s: has a single node." % filename) + continue + + self._preprocess_transition_probs(nodes, edges) + self._log.info("Walk iteration:") + + for walk_iter in range(self.num_walks): + self._log.info("%d/%d" % (walk_iter + 1, self.num_walks)) + iter_nodes = set(node.id for node in nodes) + + while iter_nodes: + node = nodes[random.sample(iter_nodes, 1)[0]] + walk = self.node2vec_walk(node, edges, nodes) + yield walk + + for walk_node in walk: + if walk_node.id in iter_nodes: + iter_nodes.remove(walk_node.id) + + def _get_log_name(self): + return "Graph" + + def _get_tokens(self, uast_node) -> List[str]: + """ + Return node tokens. + + :param uast_node: UAST node. + :return: List of tokens. + """ + return ["RoleId_%d" % role for role in uast_node.roles] + \ + list(self.token_parser.process_token(uast_node.token)) + + def _preprocess_transition_probs(self, nodes: List[GraphNode], + edges: Dict[Tuple[int, int], None]) -> None: + """ + Preprocessing of transition probabilities for guiding the random walks. + + :param nodes: List of GraphNodes in UAST. + :param edges: Dict for storing mapping from node id pairs to transition probabilities. + """ + self._log.info("Preprocessing transition probabilities.") + for edge in edges: + unnormalized_probs = np.array([ + self.p if dst_nbr == edge[0] else + 1 if (dst_nbr, edge[0]) in edges else + self.q for dst_nbr in nodes[edge[1]].neighbors + ]) + edges[edge] = alias_setup(unnormalized_probs / unnormalized_probs.sum()) + + def _preprocess_uast(self, root) -> Tuple[List[GraphNode], Dict[Tuple[int, int], None]]: + """ + Add neighbors information to UAST nodes. + + :param root: Root node in UAST. + :return: Nodes and edges in the UAST. 
+ """ + def create_node(node, id): + return GraphNode(id=id, neighbors=[], tokens=self._get_tokens(node)) + + self._log.info("Preprocessing UAST nodes.") + edges = {} + nodes = [create_node(root, 0)] + n_nodes = 1 + + for node, node_idx in node_iterator(root): + for child in node.children: + nodes.append(create_node(child, n_nodes)) + nodes[n_nodes].neighbors.append(node_idx) + nodes[node_idx].neighbors.append(n_nodes) + edges[(node_idx, n_nodes)] = edges[(n_nodes, node_idx)] = None + n_nodes += 1 + + return nodes, edges + + +def alias_setup(probs: np.array) -> Tuple[np.array, np.array]: + """ + Compute utility lists for non-uniform sampling from discrete distributions. + Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with + -many-discrete-outcomes/ + for details + + :param probs: Discrete distribution. + :return: Two helper tables. + """ + K = len(probs) + q = probs * K + J = np.zeros(K, dtype=np.int) + + # Sort the data into the outcomes with probabilities that are larger and smaller than 1/K. + smaller = np.where(q < 1.0)[0] + larger = np.where(q >= 1.0)[0] + s_idx = len(smaller) - 1 + l_idx = len(larger) - 1 + + # Loop through and create little binary mixtures that appropriately allocate the larger + # outcomes over the overall uniform mixture. + while s_idx >= 0 and l_idx >= 0: + small = smaller[s_idx] + large = larger[l_idx] + J[small] = large + q[large] += q[small] - 1.0 + + if q[large] < 1.0: + smaller[s_idx] = large + l_idx -= 1 + else: + s_idx -= 1 + + return J, q diff --git a/role2vec/roles/base.py b/role2vec/roles/base.py new file mode 100644 index 0000000..8f5b761 --- /dev/null +++ b/role2vec/roles/base.py @@ -0,0 +1,92 @@ +import os + +from sklearn.externals import joblib + +from ast2vec.token_parser import TokenParser +from role2vec.map_reduce import MapReduce +from role2vec.utils import read_embeddings + +ROLES_MODELS = dict() + + +def register_roles_model(cls): + """ + Check some conventions for class declaration and add it to ROLES_MODELS. + + :param cls: Class for roles prediction. + """ + base = "Roles" + assert issubclass(cls, RolesBase), "Must be a subclass of RolesBase." + assert cls.__name__.startswith(base), "Make sure to start your class name with %s." % (base, ) + ROLES_MODELS[cls.__name__[len(base):].lower()] = cls + + return cls + + +class RolesBase(MapReduce): + """ + Base class for roles prediction. + """ + + def __init__(self, log_level: str, num_processes: int, emb_path: str): + """ + :param log_level: Log level of RolesBase. + :param num_processes: Number of running processes. There's always one additional process + for reducing data. + :param emb_path: Path to stored roles embeddings. + """ + super(RolesBase, self).__init__(log_level=log_level, num_processes=num_processes) + self.emb, self.roles = read_embeddings(emb_path) + self.model = None + self.token_parser = TokenParser() + + def save(self, model_path: str) -> None: + """ + Store trained model on disk. + + :param model_path: Path for storing trained model. + """ + if self.model is None: + raise ValueError("Model is empty.") + self._log.info("Saving model to %s.", model_path) + joblib.dump(self.model, model_path) + + def load(self, model_path: str) -> None: + """ + Load trained model from disk. + + :param model_path: Path to trained model. + """ + if not os.path.exists(model_path): + raise ValueError("Provided path to model doesn't exist: %s", model_path) + self.model = joblib.load(model_path) + + def train(self, fname: str) -> None: + """ + Train model. 
+ + :param fname: Path to train file with filepaths to stored UASTs. + """ + raise NotImplementedError + + def test(self, fname: str) -> None: + """ + Test model. + + :param fname: Path to test file with filepaths to stored UASTs. + """ + raise NotImplementedError + + +def roles_entry(args): + RolesModel = ROLES_MODELS[args.algorithm] + rm = RolesModel(args.log_level, args.processes, args.embeddings) + + if args.train: + rm.train(args.train) + rm.save(args.model) + else: + rm.load(args.model) + + if args.test: + rm.test(args.test) diff --git a/role2vec/roles/mlp.py b/role2vec/roles/mlp.py new file mode 100644 index 0000000..7c85b52 --- /dev/null +++ b/role2vec/roles/mlp.py @@ -0,0 +1,141 @@ +from itertools import chain +import time +from typing import Dict, Tuple + +import numpy as np +from sklearn.neural_network import MLPClassifier + +from ast2vec.uast import UASTModel +from role2vec.map_reduce import MapReduce +from role2vec.roles_base import register_roles_model, RolesBase +from role2vec.utils import node_iterator, read_paths + + +@register_roles_model +class RolesMLP(RolesBase): + """ + Predicts roles using Multi-Layer Perceptron. + """ + + def train(self, fname: str) -> None: + """ + Train model. + + :param fname: Path to train file with filepaths to stored UASTs. + """ + paths = read_paths(fname) + + self._log.info("Train model.") + self.model = MLPClassifier(random_state=1, verbose=True) + self.model.classes_ = sorted(self.roles.values()) + counter = 0 + start = time.time() + + @MapReduce.wrap_queue_out() + def train_uast(self, result): + nonlocal counter, start + X, y = result + counter += 1 + self.model.partial_fit(X, y) + print(self.model.loss_, time.time() - start, counter) + + self.parallelize(paths, _process_uast, train_uast) + self._log.info("Finished training.") + + def test(self, fname: str) -> None: + """ + Test model. + + :param fname: Path to test file with filepaths to stored UASTs. + """ + paths = read_paths(fname) + + self._log.info("Test model.") + y_real, y_pred = [], [] + + @MapReduce.wrap_queue_out() + def test_uast(self, result): + nonlocal y_real, y_pred + X, y = result + y_real.extend(y) + y_pred.extend(self.model.predict_proba(X)) + + self.parallelize(paths, _process_uast, test_uast) + np.save("y_real.npy", y_real) + np.save("y_pred.npy", y_pred) + self._log.info("Finished testing.") + + def _mean_vec(self, node) -> Tuple[np.array, int]: + """ + Calculate mean of role/token embeddings for a node. + + :param node: UAST node. + :return: Mean of role/token embeddings and their total number. + """ + tokens = [t for t in chain(node.token, ("RoleId_%d" % role for role in node.roles)) + if t in self.emb] + if not tokens: + return None, 0 + return np.mean([self.emb[t] for t in tokens], axis=0), len(tokens) + + def _mean_vecs(self, root) -> Tuple[Dict[int, np.array], Dict[int, np.array]]: + """ + Calculate mean of role/token embeddings for nodes and their children in a UAST. + + :param root: UAST root node. + :return: Mappings from node indices to their parent's and their childrens' mean role/token + embeddings. 
+ """ + node_vecs = {0: self._mean_vec(root)} + child_vecs = {} + parent_vecs = {0: None} + n_nodes = 1 # incremented in accoradance with node_iterator + + for node, node_idx in node_iterator(root): + node_child_vecs = [] + node_child_ns = [] + + for child in node.children: + child_vec = self._mean_vec(child) + node_vecs[n_nodes] = child_vec + parent_vecs[n_nodes] = node_vecs[node_idx][0] + node_child_vecs.append(child_vec[0]) + node_child_ns.append(child_vec[1]) + n_nodes += 1 + + node_child_vecs = list(filter(lambda x: x is not None, node_child_vecs)) + node_child_ns = list(filter(lambda x: x != 0, node_child_ns)) + + if node_child_vecs: + child_vecs[node_idx] = np.average(node_child_vecs, axis=0, weights=node_child_ns) + else: + child_vecs[node_idx] = None + + return child_vecs, parent_vecs + + +@MapReduce.wrap_queue_in +def _process_uast(self, filename: str) -> Tuple[np.array, np.array]: + """ + Convert UAST into feature and label arrays. + Had to be defined outside of RolesMLP so that we don't suppply `self` twice. + + :param filename: Path to stored UAST. + :return: Array of concatenated mean parent and children role/token embeddings for each node and + the corresponding array of node roles. + """ + X, y = [], [] + uast_model = UASTModel().load(filename) + + for uast in uast_model.uasts: + child_vecs, parent_vecs = self._mean_vecs(uast) + for node, node_idx in node_iterator(uast): + child_vec = child_vecs[node_idx] + parent_vec = parent_vecs[node_idx] + if child_vec is not None and parent_vec is not None: + labels = np.zeros(len(self.roles), dtype=np.int8) + labels[[self.roles["RoleId_%d" % role] for role in node.roles]] = 1 + X.append(np.concatenate((child_vec, parent_vec))) + y.append(labels) + + return np.array(X), np.array(y) diff --git a/role2vec/stats.py b/role2vec/stats.py new file mode 100644 index 0000000..2bfa2d2 --- /dev/null +++ b/role2vec/stats.py @@ -0,0 +1,56 @@ +from collections import Counter +import json + +from ast2vec.uast import UASTModel +from role2vec.map_reduce import MapReduce +from role2vec.utils import node_iterator, read_paths + + +class RolesStats(MapReduce): + """ + Collects statistics for number of nodes w.r.t. number of node roles in all UASTs. + """ + + def calc(self, fname: str, stat_output: str, susp_output: str) -> None: + """ + Compute statistics and store them in JSON format. + + :param fname: Path to file with filepaths to stored UASTs. + :param stat_output: Path for storing JSON file with statistics. + :param susp_output: Path for storing txt file with info about suspicious UASTs. The file + has three columns: filepath to UAST, number of nodes in UAST, number of + nodes without roles in UAST. 
+ """ + paths = read_paths(fname) + global_counter = Counter() + suspicious = [] + + @MapReduce.wrap_queue_in + def process_uast(self, filename): + counter = Counter() + uast_model = UASTModel().load(filename) + for uast in uast_model.uasts: + for node, _ in node_iterator(uast): + counter[len(node.roles)] += 1 + return counter, filename + + @MapReduce.wrap_queue_out() + def combine_stat(self, result): + nonlocal global_counter + counter, filename = result + global_counter.update(counter) + if 0 in counter: + suspicious.append((filename, sum(counter.values()), counter[0])) + + self.parallelize(paths, process_uast, combine_stat) + with open(stat_output, "w") as fout: + json.dump(global_counter, fout) + with open(susp_output, "w") as fout: + for susp_entry in suspicious: + fout.write(", ".join(map(str, susp_entry)) + "\n") + self._log.info("Finished collecting statistics.") + + +def stats_entry(args): + role_stat = RolesStats(args.log_level, args.processes) + role_stat.calc(args.input, args.stat, args.susp) diff --git a/role2vec/tests/__init__.py b/role2vec/tests/__init__.py new file mode 100644 index 0000000..9aeddfc --- /dev/null +++ b/role2vec/tests/__init__.py @@ -0,0 +1,5 @@ +from modelforge.logs import setup_logging + + +def setup(): + setup_logging("INFO") diff --git a/role2vec/tests/models.py b/role2vec/tests/models.py new file mode 100644 index 0000000..8a9feb9 --- /dev/null +++ b/role2vec/tests/models.py @@ -0,0 +1,5 @@ +import os + +UAST = os.path.join(os.path.dirname(__file__), "uast.asdf") +UAST_FILE = os.path.join(os.path.dirname(__file__), "uast.txt") +VOCAB = os.path.join(os.path.dirname(__file__), "vocab.txt") diff --git a/role2vec/tests/test_roles_base.py b/role2vec/tests/test_roles_base.py new file mode 100644 index 0000000..bee53d0 --- /dev/null +++ b/role2vec/tests/test_roles_base.py @@ -0,0 +1,51 @@ +import os +import tempfile +import unittest + +from sklearn.externals import joblib + +from role2vec.roles.base import RolesBase + + +class RolesBaseTests(unittest.TestCase): + def setUp(self): + self.model = 1334 + with tempfile.NamedTemporaryFile(delete=False) as model_path: + self.model_path = model_path.name + joblib.dump(self.model, self.model_path) + with tempfile.NamedTemporaryFile() as emb_path: + self.rb = RolesBase(log_level="INFO", num_processes=1, emb_path=emb_path.name) + + def tearDown(self): + os.remove(self.model_path) + + def test_save(self): + with self.assertRaises(ValueError): + self.rb.save("") + try: + self.rb.model = self.model + with tempfile.NamedTemporaryFile() as model_path: + self.assertIsNone(self.rb.save(model_path.name)) + finally: + self.rb.model = None + + def test_load(self): + with self.assertRaises(ValueError): + self.rb.load("") + try: + self.rb.load(self.model_path) + self.assertEqual(self.rb.model, self.model) + finally: + self.rb.model = None + + def test_train(self): + with self.assertRaises(NotImplementedError): + self.rb.train("") + + def test_test(self): + with self.assertRaises(NotImplementedError): + self.rb.test("") + + +if __name__ == "__main__": + unittest.main() diff --git a/role2vec/tests/test_stats.py b/role2vec/tests/test_stats.py new file mode 100644 index 0000000..aa03c2a --- /dev/null +++ b/role2vec/tests/test_stats.py @@ -0,0 +1,23 @@ +import json +import tempfile +import unittest + +from role2vec.stats import RolesStats +import role2vec.tests.models as paths + + +class RolesStatsTests(unittest.TestCase): + def setUp(self): + self.rs = RolesStats(log_level="INFO", num_processes=1) + + def test_calc(self): + with 
tempfile.NamedTemporaryFile() as stat, tempfile.NamedTemporaryFile() as susp: + self.rs.calc(paths.UAST_FILE, stat.name, susp.name) + role_stats = json.loads(stat.read().decode("utf8")) + self.assertEqual(role_stats, {"0": 1, "1": 498, "2": 830, "3": 1634, "4": 1407, + "5": 412, "6": 718, "7": 2, "8": 4, "10": 359, + "11": 411}) + + +if __name__ == "__main__": + unittest.main() diff --git a/role2vec/tests/test_vocab.py b/role2vec/tests/test_vocab.py new file mode 100644 index 0000000..2606d58 --- /dev/null +++ b/role2vec/tests/test_vocab.py @@ -0,0 +1,23 @@ +import unittest + +from role2vec.vocab import Vocab +import role2vec.tests.models as paths + + +class VocabTests(unittest.TestCase): + def setUp(self): + self.vocab = Vocab(log_level="INFO", num_processes=1) + self.words_true = {} + with open(paths.VOCAB) as fin: + for line in fin: + word, count = line.split() + self.words_true[word] = int(count) + + def test_create(self): + words = self.vocab.create([paths.UAST]) + self.assertEqual(len(words), 539) + self.assertEqual(words, self.words_true) + + +if __name__ == "__main__": + unittest.main() diff --git a/role2vec/tests/uast.asdf b/role2vec/tests/uast.asdf new file mode 100644 index 0000000..b45d4b4 Binary files /dev/null and b/role2vec/tests/uast.asdf differ diff --git a/role2vec/tests/uast.txt b/role2vec/tests/uast.txt new file mode 100644 index 0000000..280aa45 --- /dev/null +++ b/role2vec/tests/uast.txt @@ -0,0 +1 @@ +role2vec/tests/uast.asdf \ No newline at end of file diff --git a/role2vec/tests/vocab.txt b/role2vec/tests/vocab.txt new file mode 100755 index 0000000..47e21cf --- /dev/null +++ b/role2vec/tests/vocab.txt @@ -0,0 +1,539 @@ +RoleId_18 5226 +RoleId_1 4165 +RoleId_85 2939 +RoleId_49 1908 +RoleId_45 1728 +RoleId_41 1095 +RoleId_47 1082 +RoleId_4 863 +RoleId_89 833 +RoleId_87 774 +RoleId_2 719 +RoleId_86 666 +RoleId_48 624 +RoleId_99 568 +self 530 +RoleId_110 349 +path 344 +RoleId_6 328 +RoleId_7 326 +RoleId_3 321 +RoleId_19 306 +RoleId_61 281 +bucket 262 +blob 228 +RoleId_105 211 +RoleId_46 211 +content 200 +assert 175 +model 154 +RoleId_50 147 +RoleId_80 135 +RoleId_94 105 +test 103 +RoleId_11 101 +RoleId_107 97 +manag 91 +RoleId_109 90 +RoleId_63 87 +name 87 +RoleId_62 87 +checkpoint 85 +return 83 +equal 82 +exists 78 +true 78 +file 78 +RoleId_42 71 +get 67 +RoleId_79 67 +RoleId_96 63 +nil 62 +RoleId_100 55 +false 55 +type 50 +RoleId_35 48 +format 47 +RoleId_43 46 +RoleId_68 46 +the 45 +delete 45 +RoleId_103 44 +notebook 44 +old 44 +directori 44 +RoleId_95 38 +new 38 +RoleId_81 37 +RoleId_21 37 +other 36 +dir 36 +list 34 +RoleId_20 34 +RoleId_5 32 +string 32 +raise 30 +RoleId_83 30 +from 29 +txt 28 +not 27 +save 26 +base 25 +google 24 +for 23 +upload 23 +blobs 21 +creat 21 +instanc 21 +last 21 +modifi 20 +web 20 +RoleId_64 20 +error 19 +debug 19 +storag 19 +rror 19 +text 18 +RoleId_39 18 +folder 18 +max 18 +httpe 18 +RoleId_91 18 +isinst 17 +client 17 +hidden 17 +ishidden 16 +mimetyp 16 +parse 15 +default 15 +cloud 15 +args 15 +applic 14 +RoleId_82 14 +class 14 +param 14 +size 13 +log 13 +RoleId_93 13 +json 13 +parent 13 +nbformat 13 +rename 13 +unicod 12 +RoleId_27 12 +cache 12 +RoleId_17 12 +RoleId_71 12 +endswith 12 +RoleId_70 12 +fetch 11 +throw 11 +errno 11 +create 10 +RoleId_26 10 +help 10 +config 10 +ipynb 10 +uuid 10 +startswith 10 +obj 10 +writabl 9 +RoleId_101 9 +result 9 +none 9 +will 9 +hook 9 +xdirectori 9 +pickle 9 +prefix 9 +RoleId_77 8 +isnone 8 +ofthe 8 +updat 8 +RoleId_15 8 +RoleId_78 8 +post 8 +acheckpoint 7 +dict 7 +dotted 7 +project 7 +read 7 
+RoleId_30 7 +hide 7 +str 6 +download 6 +messag 6 +islice 6 +broken 6 +jgscm 6 +bcontent 6 +gcs 6 +ospath 6 +found 6 +pipe 6 +this 6 +utf 6 +blah 6 +data 6 +request 6 +valid 6 +datetim 6 +encode 5 +files 5 +asstr 5 +url 5 +epipe 5 +set 5 +delimit 5 +member 5 +value 5 +raises 5 +staticmethod 5 +info 5 +bool 5 +except 5 +such 5 +afile 5 +ofclass 5 +with 5 +run 5 +python 5 +nosuch 4 +blahblah 4 +cls 4 +languag 4 +program 4 +saving 4 +untitl 4 +encod 4 +reads 4 +builds 4 +bad 4 +decode 4 +version 4 +slash 4 +adirectori 4 +bytes 4 +tornado 4 +plain 4 +used 4 +licens 4 +kwargs 4 +sfor 4 +tuple 3 +notebooknod 3 +servic 3 +gsclient 3 +xipynb 3 +forbidden 3 +current 3 +ascii 3 +should 3 +one 3 +dumps 3 +tmpl 3 +stream 3 +keyfil 3 +anotebook 3 +generic 3 +you 3 +RoleId_52 3 +replac 3 +RoleId_34 3 +ifcont 3 +len 3 +output 3 +isrequest 3 +which 3 +fold 3 +main 3 +force 3 +node 3 +and 3 +unescap 3 +author 3 +rsplit 3 +vadim 2 +sfrom 2 +develop 2 +apath 2 +github 2 +restor 2 +raw 2 +tokeep 2 +traitlet 2 +copy 2 +beencod 2 +reason 2 +jupyt 2 +part 2 +com 2 +socket 2 +djgscm 2 +non 2 +provid 2 +fmt 2 +anoth 2 +sys 2 +togcs 2 +ifbase 2 +requir 2 +execut 2 +exc 2 +paramet 2 +form 2 +gcloud 2 +two 2 +any 2 +bydefault 2 +setup 2 +src 2 +RoleId_51 2 +packag 2 +alist 2 +keep 2 +tothe 2 +key 2 +atthe 2 +descript 2 +wrap 2 +decodebyt 2 +isnot 2 +nofile 2 +while 2 +int 2 +includ 2 +append 2 +convert 2 +check 2 +bedecod 2 +case 2 +asbase 2 +attribut 2 +asutf 2 +popul 2 +classmethod 2 +amodel 2 +mit 2 +ifyou 2 +iftext 2 +decod 2 +orbase 2 +anon 2 +ifnot 2 +donot 2 +https 2 +can 2 +just 2 +adict 2 +split 2 +exist 2 +octet 2 +properti 2 +cells 2 +join 2 +state 2 +may 2 +sub 2 +mime 2 +encodebyt 2 +gets 2 +ext 2 +mixin 2 +bbytes 2 +splitext 2 +only 1 +asingl 1 +thereof 1 +relat 1 +single 1 +mark 1 +dot 1 +root 1 +sown 1 +tochang 1 +script 1 +unhandl 1 +call 1 +tosplit 1 +inwhich 1 +super 1 +where 1 +inside 1 +asconvert 1 +rfind 1 +interpret 1 +bepopul 1 +ordirectori 1 +use 1 +process 1 +ingcs 1 +try 1 +beused 1 +itexist 1 +redefin 1 +end 1 +snew 1 +writes 1 +always 1 +same 1 +called 1 +api 1 +uses 1 +revers 1 +ralreadi 1 +pre 1 +sname 1 +RoleId_44 1 +nbconvert 1 +jsone 1 +ifformat 1 +ifthe 1 +repr 1 +desktop 1 +splits 1 +count 1 +into 1 +approv 1 +ifunicod 1 +upclass 1 +ajson 1 +unexpect 1 +importstr 1 +ifpath 1 +setuptool 1 +level 1 +reader 1 +tointerpret 1 +serial 1 +jgcsm 1 +miss 1 +change 1 +items 1 +bysave 1 +failed 1 +double 1 +asunicod 1 +isunknown 1 +interact 1 +start 1 +when 1 +beeither 1 +time 1 +callabl 1 +itertool 1 +own 1 +instal 1 +iffals 1 +behandl 1 +librari 1 +indic 1 +status 1 +noconvert 1 +sort 1 +needed 1 +faster 1 +either 1 +becal 1 +someth 1 +isneed 1 +tear 1 +via 1 +unknown 1 +specifi 1 +open 1 +RoleId_98 1 +alreadi 1 +classifi 1 +consid 1 +html 1 +handl 1 +oper 1 +agiven 1 +retriev 1 +empty 1 +orhtml 1 +ortupl 1 +tocach 1 +down 1 +next 1 +acont 1 +offile 1 +names 1 +tonew 1 +extract 1 +saved 1 +pick 1 +touse 1 +greater 1 +ashidden 1 +limit 1 +nbclass 1 +agener 1 +but 1 +was 1 +namespac 1 +ascript 1 +intern 1 +spath 1 +defin 1 +orimportstr 1 +loads 1 +update 1 +email 1 +stdout 1 +metadata 1 +object 1 +astext 1 +softwar 1 +users 1 +intend 1 +otherwis 1 +context 1 +structur 1 +readme 1 +toprocess 1 +apart 1 +cell 1 +dirnam 1 +unittest 1 +ifempti 1 +sign 1 +isused 1 +ofcheckpoint 1 +like 1 +toopen 1 +asvers 1 +keyword 1 +ondisk 1 +ingoogl 1 +onthe 1 +isipynb 1 +disk 1 +markovtsev 1 +explicit 1 +must 1 +common 1 +osi 1 +topic 1 +whether 1 +RoleId_111 1 +code 1 +iftrue 1 +trust 1 +ipython 1 +errors 1 
+ofnbformat 1 +wtf 1 +input 1 +RoleId_84 1 +ifdefin 1 +collaps 1 +tech 1 +sourc 1 +nump 1 +given 1 +account 1 +escape 1 +audienc 1 +alpha 1 \ No newline at end of file diff --git a/role2vec/utils.py b/role2vec/utils.py new file mode 100644 index 0000000..c71092c --- /dev/null +++ b/role2vec/utils.py @@ -0,0 +1,52 @@ +from itertools import islice +from typing import Dict, List, Tuple + +import numpy as np + + +def node_iterator(root): + """ + Enumerate UAST nodes using depth-first approach. + """ + queue = [(root, 0)] + n_nodes = 1 + while queue: + node, node_idx = queue.pop() + yield node, node_idx + for child in node.children: + queue.append((child, n_nodes)) + n_nodes += 1 + + +def read_embeddings(emb_path: str) -> Tuple[Dict[str, np.array], List[str]]: + emb = {} + roles = [] + + with open(emb_path) as fin: + for line in fin: + word, *vec = line.split("\t") + emb[word] = np.array(vec, dtype=np.float) + if word.startswith("RoleId_"): + roles.append(word) + + roles = {role: i for i, role in enumerate(roles)} + return emb, roles + + +def read_paths(fname: str) -> List[str]: + with open(fname) as fin: + paths = [line.strip() for line in fin.readlines()] + if not paths: + raise ValueError("Make sure the file is not empty!") + return paths + + +def read_vocab(vocab_path: str, num_words: int=None) -> List[str]: + with open(vocab_path) as fin: + words = [line.split(" ")[0] for line in islice(fin, num_words)] + return words + + +def save_vocab(vocab_path: str, vocab: Dict[str, int]) -> None: + with open(vocab_path, "w") as fout: + fout.write("\n".join(map(lambda x: "%s %d" % x, vocab.most_common()))) diff --git a/role2vec/vocab.py b/role2vec/vocab.py new file mode 100644 index 0000000..dd85d3a --- /dev/null +++ b/role2vec/vocab.py @@ -0,0 +1,68 @@ +from collections import Counter +from typing import Dict, List + +from ast2vec.token_parser import TokenParser +from ast2vec.uast import UASTModel +from role2vec.map_reduce import MapReduce +from role2vec.utils import node_iterator, read_paths, save_vocab + + +class Vocab(MapReduce): + """ + Collects vocabulary from UASTs. + """ + + def __init__(self, log_level: str, num_processes: int): + """ + :param log_level: Log level of Vocab. + :param num_processes: Number of running processes. There's always one additional process + for reducing data. + """ + super(Vocab, self).__init__(log_level=log_level, num_processes=num_processes) + self.token_parser = TokenParser() + + def create(self, files: List[str]) -> Dict[str, int]: + """ + Create vocabulary by processing supplied UASTs. + + :param files: List of filepaths to stored UASTs. + :return: Dict with tokens and their number of occurrences. + """ + vocab = Counter() + + @MapReduce.wrap_queue_in + def uasts_vocab(self, filename): + uast_model = UASTModel().load(filename) + tokens = Counter() + for uast in uast_model.uasts: + for node, _ in node_iterator(uast): + tokens.update(self._get_tokens(node)) + return tokens + + @MapReduce.wrap_queue_out() + def combine_vocab(self, result): + nonlocal vocab + vocab.update(result) + + self.parallelize(files, uasts_vocab, combine_vocab) + return vocab + + def _get_log_name(self): + return "Vocab" + + def _get_tokens(self, uast_node) -> List[str]: + """ + Return node tokens. + + :param uast_node: UAST node. + :return: List of tokens. 
+ """ + return ["RoleId_%d" % role for role in uast_node.roles] + \ + list(self.token_parser.process_token(uast_node.token)) + + +def vocab_entry(args): + uasts = read_paths(args.input) + vocab = Vocab(args.log_level, args.processes) + words = vocab.create(uasts) + save_vocab(args.output, words) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..2a179ed --- /dev/null +++ b/setup.py @@ -0,0 +1,40 @@ +import sys + +from setuptools import setup, find_packages + +if sys.version_info < (3, 5, 0): + typing = ["typing"] +else: + typing = [] + +setup( + name="role2vec", + description="Part of source{d}'s stack for machine learning on source code. Provides API and " + "tools to train and use models for role prediction of UAST nodes extracted from " + "Babelfish.", + version="0.0.1-alpha", + license="Apache 2.0", + author="source{d}", + author_email="machine-learning@sourced.tech", + url="https://github.com/src-d/role2vec", + download_url="https://github.com/src-d/role2vec", + packages=find_packages(exclude=("role2vec.tests",)), + entry_points={ + "console_scripts": ["role2vec=role2vec.__main__:main"], + }, + keywords=["machine learning on source code", "word2vec", "id2vec", + "github", "swivel", "nbow", "bblfsh", "babelfish"], + install_requires=["ast2vec[tf]>=0.3.4-alpha", "scikit-learn>=0.19.0"] + typing, + package_data={"": ["LICENSE", "README.md"]}, + classifiers=[ + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Operating System :: POSIX", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Topic :: Software Development :: Libraries" + ] +)
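
For reference, a minimal sketch of how the modules introduced in this patch compose. It is not part of the diff: the path names ("uasts.txt", "vocab.txt", "prox/", "cooccurrences.bin") are hypothetical placeholders, and the word embeddings themselves are assumed to be trained afterwards with the external GloVe tool before the `mlp` role classifier is fit. The same pipeline is exposed on the command line as the `vocab`, `node2vec`, `glove`, `stats` and `mlp` subcommands of `python -m role2vec`.

# Sketch only: placeholder paths, assuming the API exactly as added in this patch.
from role2vec.glove import GloVe
from role2vec.node2vec import Node2Vec
from role2vec.random_walk import Graph
from role2vec.utils import read_paths, save_vocab
from role2vec.vocab import Vocab

uast_list = "uasts.txt"  # one path to a stored UAST model (*.asdf) per line

# 1. Collect the role/token vocabulary from the UASTs and store it on disk.
vocab = Vocab(log_level="INFO", num_processes=2)
save_vocab("vocab.txt", vocab.create(read_paths(uast_list)))

# 2. Run node2vec random walks and write one proximity matrix per UAST file.
graph = Graph(log_level="INFO", num_walks=1, walk_length=80, p=1.0, q=1.0)
node2vec = Node2Vec(log_level="INFO", num_processes=2, vocab_path="vocab.txt",
                    window=5, graph=graph)
node2vec.process(uast_list, "prox/")

# 3. Combine the per-file matrices into a single binary file in GloVe input format.
glove = GloVe(log_level="INFO", num_processes=2, vocab_path="vocab.txt")
glove.convert("prox/", "cooccurrences.bin", "**/*.asdf")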