Bug 1413254 - Write a JSON file with normalized data; r=jmaher

The new file contains changes that optimize for final file size: * The components are stored as an integer index/key into a map instead of strings. * Paths are stored in hierarchical dicts. * No indentation is used. These changes significantly reduce redundancy in string data. For every file in the repo, the raw JSON size is decreased from 27,074,607 to 7,643,529 bytes. MozReview-Commit-ID: 58FWSct8W39 --HG-- extra : rebase_source : 3ccc17d58fadaaac428b2eb2d3ad357fec9afafd
2017-10-31 10:57:14 -07:00 · 2017-10-31 10:57:14 -07:00 · f8e8ee5d2e
--- a/python/mozbuild/mozbuild/frontend/mach_commands.py
+++ b/python/mozbuild/mozbuild/frontend/mach_commands.py
@ -186,6 +186,7 @@ class MozbuildFileCommands(MachCommandBase):
        import gzip

        missing_component = set()
+        seen_components = set()
        component_by_path = {}

        # TODO operate in VCS space. This requires teaching the VCS reader
@ -198,10 +199,29 @@ class MozbuildFileCommands(MachCommandBase):
                continue

            c = m['BUG_COMPONENT']
+            seen_components.add(c)
            component_by_path[p] = [c.product, c.component]

        print('Examined %d files' % len(component_by_path))

+        # We also have a normalized version of the file to components mapping
+        # that requires far less storage space by eliminating redundant strings.
+        indexed_components = {i: [c.product, c.component]
+                              for i, c in enumerate(sorted(seen_components))}
+        components_index = {tuple(v): k for k, v in indexed_components.items()}
+        normalized_component = {
+            'components': indexed_components,
+            'paths': {}
+        }
+
+        for p, c in component_by_path.items():
+            d = normalized_component['paths']
+            while '/' in p:
+                base, p = p.split('/', 1)
+                d = d.setdefault(base, {})
+
+            d[p] = components_index[tuple(c)]
+
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

@ -215,8 +235,15 @@ class MozbuildFileCommands(MachCommandBase):
        with open(missing_json, 'wb') as fh:
            json.dump({'missing': sorted(missing_component)}, fh, indent=2)

+        indexed_components_json = os.path.join(out_dir,
+                                               'components-normalized.json')
+        print('Writing %s' % indexed_components_json)
+        with open(indexed_components_json, 'wb') as fh:
+            # Don't indent so file is as small as possible.
+            json.dump(normalized_component, fh, sort_keys=True)
+
        # Write compressed versions of JSON files.
-        for p in (components_json, missing_json):
+        for p in (components_json, indexed_components_json, missing_json):
            gzip_path = '%s.gz' % p
            print('Writing %s' % gzip_path)
            with open(p, 'rb') as ifh, gzip.open(gzip_path, 'wb') as ofh: