Bug 1430825 - [mozlint] Split work up by paths instead of by linters, r=standard8

The initial motivation for this patch, was to prevent command lines that are
too long on Windows. To that end, there is a cap to the number of paths that
can be run per job. For now that cap is set to 50. This will allow for an
average path length of 160 characters, which should be sufficient with room to
spare.

But another big benefit of this patch is that we are running more things in
parallel. Previously, mozlint ran each linter in its own subprocess, but that's
it. If running eslint is 90% of the work, it'll still only get a single
process. This means we are wasting cores as soon as the other linters are
finished.

This patch chunks the number of specified paths such that there will be N*L
jobs where 'N' is the number of cores and 'L' is the number of linters.  This
means even when there's a dominant linter, we'll be making better use of our
resources. This isn't perfect of course, as some paths might contain a small
number of files, and some will contain a very large number of files.  But it's
a start.

A limitation to this approach is that if there are fewer paths specified than
there are cores, we won't schedule enough jobs per linter to use those extra
cores. One idea might be to expand specified directories and individually list
all the paths under the directory. But this has some hairy edge cases that
would be tough to catch. Doing this in a non-hacky way would also require a
medium scale refactor.

So I propose further parallelization efforts be destined for follow-ups.

MozReview-Commit-ID: JRRu13AFaii

--HG--
extra : rebase_source : 242fb71fe0af8bd2a981bd10a7216bb897fe00ac
This commit is contained in:
Andrew Halberstadt 2018-01-16 16:01:20 -05:00
Родитель 94f0c2a33e
Коммит 529aad2e2c
2 изменённых файлов: 51 добавлений и 5 удалений

Просмотреть файл

@ -10,6 +10,7 @@ import sys
import traceback
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor
from math import ceil
from multiprocessing import cpu_count
from subprocess import CalledProcessError
@ -56,6 +57,7 @@ class LintRoller(object):
version control or cwd.
:param lintargs: Arguments to pass to the underlying linter(s).
"""
MAX_PATHS_PER_JOB = 50 # set a max size to prevent command lines that are too long on Windows
def __init__(self, root, **lintargs):
self.parse = Parser()
@ -82,6 +84,14 @@ class LintRoller(object):
for path in paths:
self.linters.extend(self.parse(path))
def _generate_jobs(self, paths, num_procs):
"""A job is of the form (<linter:dict>, <paths:list>)."""
chunk_size = min(self.MAX_PATHS_PER_JOB, int(ceil(float(len(paths)) / num_procs)))
while paths:
for linter in self.linters:
yield linter, paths[:chunk_size]
paths = paths[chunk_size:]
def roll(self, paths=None, outgoing=None, workdir=None, num_procs=None):
"""Run all of the registered linters against the specified file paths.
@ -132,12 +142,10 @@ class LintRoller(object):
paths = map(os.path.abspath, paths)
num_procs = num_procs or cpu_count()
num_procs = min(num_procs, len(self.linters))
all_results = defaultdict(list)
with ProcessPoolExecutor(num_procs) as executor:
futures = [executor.submit(_run_worker, config, paths, **self.lintargs)
for config in self.linters]
futures = [executor.submit(_run_worker, config, p, **self.lintargs)
for config, p in self._generate_jobs(paths, num_procs)]
# ignore SIGINT in parent so we can still get partial results
# from child processes. These should shutdown quickly anyway.
orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)

Просмотреть файл

@ -5,6 +5,7 @@
from __future__ import absolute_import
import os
import platform
import sys
import mozunit
@ -76,10 +77,47 @@ def test_roll_with_failure_code(lint, lintdir, files):
lint.read(os.path.join(lintdir, 'badreturncode.yml'))
assert lint.failed is None
result = lint.roll(files)
result = lint.roll(files, num_procs=1)
assert len(result) == 0
assert lint.failed == ['BadReturnCodeLinter']
def fake_run_linters(config, paths, **lintargs):
    """Stand-in worker: every invocation contributes one 'count' entry
    and no failures, so jobs can be tallied from the merged results."""
    results = {'count': [1]}
    failed = []
    return results, failed
@pytest.mark.skipif(platform.system() == 'Windows',
                    reason="monkeypatch issues with multiprocessing on Windows")
@pytest.mark.parametrize('num_procs', [1, 4, 8, 16])
def test_number_of_jobs(monkeypatch, lint, linters, files, num_procs):
    """Roll schedules num_procs (or len(files), if smaller) jobs per linter."""
    # Swap in the stub worker so each scheduled job adds exactly one
    # 'count' entry to the merged result dict.
    target_module = sys.modules[lint.__module__]
    monkeypatch.setattr(target_module, '_run_linters', fake_run_linters)

    lint.read(linters)
    result = lint.roll(files, num_procs=num_procs)
    num_jobs = len(result['count'])

    # One job per (linter, chunk); the paths are split into one chunk per
    # process, unless there are fewer paths than processes.
    expected_chunks = num_procs if len(files) >= num_procs else len(files)
    assert num_jobs == expected_chunks * len(linters)
@pytest.mark.skipif(platform.system() == 'Windows',
                    reason="monkeypatch issues with multiprocessing on Windows")
@pytest.mark.parametrize('max_paths,expected_jobs', [(1, 12), (4, 6), (16, 6)])
def test_max_paths_per_job(monkeypatch, lint, linters, files, max_paths, expected_jobs):
    """MAX_PATHS_PER_JOB caps the chunk size and thus raises the job count."""
    # Stub the worker so the number of 'count' entries equals the number
    # of jobs that were scheduled.
    monkeypatch.setattr(sys.modules[lint.__module__], '_run_linters', fake_run_linters)

    # Pin the fixture sizes so expected_jobs is meaningful: 4 paths, 3 linters.
    selected_files = files[:4]
    selected_linters = linters[:3]
    assert len(selected_files) == 4
    assert len(selected_linters) == 3

    lint.MAX_PATHS_PER_JOB = max_paths
    lint.read(selected_linters)
    result = lint.roll(selected_files, num_procs=2)
    assert len(result['count']) == expected_jobs
# Allow running this test file directly; mozunit hands off to the test runner.
if __name__ == '__main__':
    mozunit.main()