Add script to retrieve data from a Mercurial repository

This commit is contained in:
Marco Castelluccio 2018-11-12 12:55:27 +01:00
Родитель 466aa8446b
Коммит 121b42a9a3
3 изменённых файлов: 48 добавлений и 0 удалений

Просмотреть файл

@ -8,5 +8,7 @@ The dataset currently contains 2110 bugs, the accuracy of the current classifier
1. Run `pip install -r requirements.txt` and `pip install -r test-requirements.txt`
2. Run `cat data/bugs.json.xz.part* | unxz > data/bugs.json`
3. Run `cat data/commits.json.xz.part* | unxz > data/commits.json`
If you update the bugs database, run `cat data/bugs.json | xz -v9 - | split -d -b 20MB - data/bugs.json.xz.part`.
If you update the commits database, run `cat data/commits.json | xz -v9 - | split -d -b 20MB - data/commits.json.xz.part`.

45
repository.py Normal file
Просмотреть файл

@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import argparse
import hglib
import db
# Path of the commits database: read by get_commits() and written by download_commits().
COMMITS_DB = 'data/commits.json'
def get_commits():
    """Return whatever ``db.read`` yields for the commits database ``COMMITS_DB``."""
    stored_commits = db.read(COMMITS_DB)
    return stored_commits
def download_commits(repo_dir):
    """Export the log of a Mercurial repository to ``COMMITS_DB``.

    Opens the repository at ``repo_dir`` through hglib, reads its log,
    converts every entry into a dict of strings and hands the resulting
    list to ``db.write``.

    Args:
        repo_dir: Path to the local Mercurial repository.
    """
    def transform(commit):
        """Turn one log entry into a dict of plain strings.

        Elements 0-5 of the entry are bytes and are decoded as UTF-8;
        element 6 (the date) is stringified with ``str``.
        """
        return {
            'rev': commit[0].decode('utf-8'),
            'node': commit[1].decode('utf-8'),
            'tags': commit[2].decode('utf-8'),
            'branch': commit[3].decode('utf-8'),
            'author': commit[4].decode('utf-8'),
            'desc': commit[5].decode('utf-8'),
            'date': str(commit[6]),
        }

    hg = hglib.open(repo_dir)
    try:
        # Convert while the client is still open so the log is fully
        # consumed before the connection goes away.
        commits = [transform(commit) for commit in hg.log()]
    finally:
        # hglib.open starts a Mercurial command server; always shut it
        # down, even if reading or decoding the log fails.
        hg.close()

    db.write(COMMITS_DB, commits)
if __name__ == '__main__':
    # Command-line entry point: the only argument is the repository path.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        'repository_dir',
        action='store',
        help='Path to the repository',
    )
    download_commits(cli.parse_args().repository_dir)

Просмотреть файл

@ -5,3 +5,4 @@ requests==2.19.1
numpy==1.15.2
imbalanced-learn==0.3.3
spacy==2.0.12
python-hglib==2.6.1