Merge pull request #501 from quicktype/markov

Actually use the Markov Chain for map inference, for real this time
2018-02-08 08:41:00 -08:00 · 2018-02-08 08:41:00 -08:00 · f44293cc6c
--- a/package.json
+++ b/package.json
@ -22,10 +22,12 @@
    "handlebars": "^4.0.11",
    "immutable": "^4.0.0-rc.9",
    "is-url": "^1.2.2",
+    "js-base64": "^2.4.3",
    "lodash": "^4.17.4",
    "moment": "^2.19.3",
    "node-fetch": "^1.7.1",
    "pkg": "^4.3.0",
+    "pako": "^1.0.6",
    "pluralize": "^7.0.0",
    "stream-json": "0.5.2",
    "string-hash": "^1.1.3",
@ -36,8 +38,10 @@
    "@types/graphql": "^0.11.7",
    "@types/handlebars": "^4.0.36",
    "@types/is-url": "^1.2.28",
+    "@types/js-base64": "^2.3.1",
    "@types/lodash": "^4.14.72",
    "@types/node": "^8.0.19",
+    "@types/pako": "^1.0.0",
    "@types/pluralize": "0.0.28",
    "@types/shelljs": "^0.7.6",
    "@types/string-hash": "^1.1.1",
--- a/src/EncodedMarkovChain.ts
+++ b/src/EncodedMarkovChain.ts
--- a/src/InferMaps.ts
+++ b/src/InferMaps.ts
@ -7,14 +7,47 @@ import { defined, panic } from "./Support";
 import { TypeGraph } from "./TypeGraph";
 import { GraphRewriteBuilder, TypeRef, StringTypeMapping } from "./TypeBuilder";
 import { unifyTypes } from "./UnifyClasses";
+import { MarkovChain, load, evaluate } from "./MarkovChain";

 const mapSizeThreshold = 20;

+let markovChain: MarkovChain | undefined = undefined;
+
+function nameProbability(name: string): number {
+    if (markovChain === undefined) {
+        markovChain = load();
+    }
+    return evaluate(markovChain, name);
+}
+
 function shouldBeMap(properties: Map<string, ClassProperty>): Set<Type> | undefined {
    // Only classes with a certain number of properties are inferred
    // as maps.
-    if (properties.size < mapSizeThreshold) {
-        return undefined;
+    const numProperties = properties.size;
+    if (numProperties < 2) return undefined;
+
+    if (numProperties < mapSizeThreshold) {
+        const names = properties.keySeq();
+        const probabilities = names.map(nameProbability);
+        const product = probabilities.reduce((a, b) => a * b, 1);
+        const probability = Math.pow(product, 1 / numProperties);
+        // The idea behind this is to have a probability limit around 0.004 for
+        // n=1, up to around 1.0 for n=20.  I.e. when we only have a few
+        // properties, they need to look really weird to infer a map, but
+        // when we have more we'll accept more ordinary names.  The details
+        // of the formula are immaterial because I pulled it out of my ass.
+        const exponent = 5;
+        const scale = Math.pow(22, exponent);
+        const limit = Math.pow(numProperties + 2, exponent) / scale + (0.004 - Math.pow(3, exponent) / scale);
+        if (probability > limit) return undefined;
+
+        /*
+        console.log(
+            `limit for ${JSON.stringify(names.toArray())} - ${JSON.stringify(
+                probabilities.toArray()
+            )} is ${limit}, we are at ${probability}`
+        );
+        */
    }

    // FIXME: simplify this - it's no longer necessary with the new
--- a/src/MarkovChain.ts
+++ b/src/MarkovChain.ts
@ -1,10 +1,13 @@
 "use strict";

-import * as fs from "fs";
+import { Base64 } from "js-base64";

 import { panic } from "./Support";
+import { encodedMarkovChain } from "./EncodedMarkovChain";
+import * as pako from "pako";

-export type SubTrie = number | undefined | Trie;
+// This must be null, not undefined, because we read it from JSON.
+export type SubTrie = number | null | Trie;
 export type Trie = {
    count: number;
    arr: SubTrie[];
@ -18,7 +21,7 @@ export type MarkovChain = {
 function makeTrie(): Trie {
    const arr: SubTrie[] = [];
    for (let i = 0; i < 128; i++) {
-        arr.push(undefined);
+        arr.push(null);
    }
    return { count: 0, arr };
 }
@ -34,17 +37,17 @@ function lookup(t: Trie, seq: string, i: number): number | undefined {
            return panic("Malformed trie");
        }
        const n = t.arr[first];
+        if (n === null) {
+            return undefined;
+        }
        if (typeof n === "object") {
            return panic("Malformed trie");
        }
-        if (n === undefined) {
-            return undefined;
-        }
        return n / t.count;
    }

    const st = t.arr[first];
-    if (st === undefined) {
+    if (st === null) {
        return undefined;
    }
    if (typeof st !== "object") {
@ -64,11 +67,10 @@ function increment(t: Trie, seq: string, i: number): void {
            return panic("Malformed trie");
        }
        let n = t.arr[first];
-        if (typeof n === "object") {
-            return panic("Malformed trie");
-        }
-        if (n === undefined) {
+        if (n === null) {
            n = 0;
+        } else if (typeof n === "object") {
+            return panic("Malformed trie");
        }
        t.arr[first] = n + 1;
        t.count += 1;
@ -76,7 +78,7 @@ function increment(t: Trie, seq: string, i: number): void {
    }

    let st = t.arr[first];
-    if (st === undefined) {
+    if (st === null) {
        t.arr[first] = st = makeTrie();
    }
    if (typeof st !== "object") {
@ -85,9 +87,7 @@ function increment(t: Trie, seq: string, i: number): void {
    return increment(st, seq, i + 1);
 }

-export function train(filename: string, depth: number): MarkovChain {
-    const contents = fs.readFileSync(filename).toString();
-    const lines = contents.split("\n");
+export function train(lines: string[], depth: number): MarkovChain {
    const trie = makeTrie();
    for (const l of lines) {
        for (let i = depth; i <= l.length; i++) {
@ -95,11 +95,12 @@ export function train(filename: string, depth: number): MarkovChain {
        }
    }

-    const mc = { trie, depth };
+    return { trie, depth };
+}

-    fs.writeFileSync("/tmp/markov.json", JSON.stringify(runLengthEncodeMarkovChain(mc)));
-
-    return mc;
+export function load(): MarkovChain {
+    const bytes = Base64.atob(encodedMarkovChain);
+    return JSON.parse(pako.inflate(bytes, { to: "string" }));
 }

 export function evaluate(mc: MarkovChain, word: string): number {
@ -122,7 +123,9 @@ function testWord(mc: MarkovChain, word: string): void {
    console.log(`"${word}": ${evaluate(mc, word)}`);
 }

-export function test(mc: MarkovChain): void {
+export function test(): void {
+    const mc = load();
+
    testWord(mc, "url");
    testWord(mc, "json");
    testWord(mc, "my_property");
@ -132,47 +135,18 @@ export function test(mc: MarkovChain): void {
    testWord(mc, "2BTZIqw0ntH9MvilQ3ewNY");
    testWord(mc, "0uBTNdNGb2OY5lou41iYL52LcDq2");
    testWord(mc, "-KpqHmWuDOUnr1hmAhxp");
+    testWord(mc, "granularity");
+    testWord(mc, "coverage");
+    testWord(mc, "postingFrequency");
+    testWord(mc, "dataFrequency");
+    testWord(mc, "units");
+    testWord(mc, "datasetOwner");
+    testWord(mc, "organization");
+    testWord(mc, "timePeriod");
+    testWord(mc, "contactInformation");
+
    testWord(
        mc,
        "\ud83d\udebe \ud83c\udd92 \ud83c\udd93 \ud83c\udd95 \ud83c\udd96 \ud83c\udd97 \ud83c\udd99 \ud83c\udfe7"
    );
 }
-
-function runLengthEncodeArray<T>(arr: T[]): [number, T][] {
-    const result: [number, T][] = [];
-    if (arr.length === 0) return result;
-    let runItem: T = arr[0];
-    let runStart = 0;
-    let i = 1;
-
-    while (i < arr.length) {
-        const item = arr[i];
-        if (item !== runItem) {
-            result.push([i - runStart, runItem]);
-            runItem = item;
-            runStart = i;
-        }
-        i++;
-    }
-    result.push([i - runStart, runItem]);
-
-    return result;
-}
-
-function runLengthEncodeTrie(t: Trie): any {
-    return {
-        count: t.count,
-        arr: runLengthEncodeArray(
-            t.arr.map(x => {
-                if (typeof x === "object") {
-                    return runLengthEncodeTrie(x);
-                }
-                return x;
-            })
-        )
-    };
-}
-
-function runLengthEncodeMarkovChain(mc: MarkovChain): any {
-    return { depth: mc.depth, trie: runLengthEncodeTrie(mc.trie) };
-}
--- a/src/cli.ts
+++ b/src/cli.ts
@ -27,7 +27,7 @@ import { Readable } from "stream";
 import { panic, assert, defined, withDefault } from "./Support";
 import { introspectServer } from "./GraphQLIntrospection";
 import { getStream } from "./get-stream/index";
-import { train, test } from "./MarkovChain";
+import { train } from "./MarkovChain";

 const commandLineArgs = require("command-line-args");
 const getUsage = require("command-line-usage");
@ -50,7 +50,7 @@ export interface CLIOptions {
    graphqlServerHeader?: string[];
    template?: string;
    out?: string;
-    markovInputFilename?: string;
+    buildMarkovChain?: string;
    findSimilarClassesSchema?: string;

    noMaps: boolean;
@ -242,7 +242,7 @@ function inferOptions(opts: Partial<CLIOptions>): CLIOptions {
        quiet: opts.quiet || false,
        version: opts.version || false,
        out: opts.out,
-        markovInputFilename: opts.markovInputFilename,
+        buildMarkovChain: opts.buildMarkovChain,
        findSimilarClassesSchema: opts.findSimilarClassesSchema,
        graphqlSchema: opts.graphqlSchema,
        graphqlIntrospect: opts.graphqlIntrospect,
@ -351,9 +351,10 @@ const optionDefinitions: OptionDefinition[] = [
        description: "Make all class properties optional."
    },
    {
-        name: "markov-input-filename",
+        name: "build-markov-chain",
        type: String,
-        description: "Markov corpus filename."
+        typeLabel: "FILE",
+        description: "Markov chain corpus filename."
    },
    {
        name: "find-similar-classes-schema",
@ -399,7 +400,7 @@ const sectionsBeforeRenderers: UsageSection[] = [
    {
        header: "Options",
        optionList: optionDefinitions,
-        hide: ["no-render", "find-similar-classes-schema"]
+        hide: ["no-render", "build-markov-chain", "find-similar-classes-schema"]
    }
 ];

@ -543,9 +544,11 @@ export async function main(args: string[] | Partial<CLIOptions>) {
            console.log("Visit quicktype.io for more info.");
            return;
        }
-        if (options.markovInputFilename !== undefined) {
-            const mc = train(options.markovInputFilename, 3);
-            test(mc);
+        if (options.buildMarkovChain !== undefined) {
+            const contents = fs.readFileSync(options.buildMarkovChain).toString();
+            const lines = contents.split("\n");
+            const mc = train(lines, 3);
+            console.log(JSON.stringify(mc));
            return;
        }

--- a/test/generate-markov-corpus.py
+++ b/test/generate-markov-corpus.py
@ -8,18 +8,22 @@ with open('/usr/share/dict/words') as f:
 with open('acronyms.txt') as f:
    acronyms = f.read().splitlines()

+
 def all_lower(w):
    [word, _] = w
    return word.lower()

+
 def all_upper(w):
    [word, _] = w
    return word.upper()

+
 def capitalize(w):
    [word, _] = w
    return word[:1].upper() + word[1:].lower()

+
 def cap_and_upper_acro(w):
    [word, is_acro] = w
    if is_acro:
@ -27,6 +31,7 @@ def cap_and_upper_acro(w):
    else:
        return capitalize(w)

+
 def choice(items):
    total = sum([n for [n, _] in items])
    x = random.random()
@ -39,8 +44,12 @@ def choice(items):

 formats = [
    [3, [all_lower, all_lower, "_"]],
+    [1, [all_upper, all_upper, "_"]],
+    [2, [all_lower, all_lower, "-"]],
    [1, [all_upper, all_upper, "-"]],
+    [1, [all_lower, capitalize, "-"]],
    [1, [all_lower, all_lower, " "]],
+    [1, [capitalize, capitalize, " "]],
    [5, [all_lower, capitalize, ""]],
    [5, [all_lower, cap_and_upper_acro, ""]],
    [3, [cap_and_upper_acro, cap_and_upper_acro, ""]]
@ -51,21 +60,27 @@ prefixes = [
    [1, "_"]
 ]

+
 def word():
    return [[random.choice(words), False]]

+
 def word_word():
    return [[random.choice(words), False], [random.choice(words), False]]

+
 def word_acronym():
    return [[random.choice(words), False], [random.choice(acronyms), True]]

+
 def acronym_word():
    return [[random.choice(acronyms), True], [random.choice(words), False]]

+
 def word_digit():
    return [[random.choice(words), False], [str(random.randint(1, random.randint(1, 200))), False]]

+
 def word_acronym_digit():
    return [[random.choice(words), False], [random.choice(acronyms), True], [str(random.randint(1, 9)), False]]

@ -78,6 +93,7 @@ generators = [
    [2, word_acronym_digit]
 ]

+
 def make_corpus_entry():
    words = choice(generators)()
    [first_format, rest_format, separator] = choice(formats)
--- a/test/make-encoded-markov-chain.sh
+++ b/test/make-encoded-markov-chain.sh
@ -0,0 +1,8 @@
+#!/bin/bash
+
+./generate-markov-corpus.py >/tmp/corpus.txt
+../script/quickertype --build-markov-chain /tmp/corpus.txt >/tmp/markov.json
+gzip -9 /tmp/markov.json
+echo -n 'export const encodedMarkovChain = "'
+base64 /tmp/markov.json.gz | tr -d '\n'
+echo '";'