Merge pull request #501 from quicktype/markov

Actually use the Markov Chain for map inference, for real this time
This commit is contained in:
Mark Probst 2018-02-08 08:41:00 -08:00 коммит произвёл GitHub
Родитель 6c23967d3f 021e33b5e6
Коммит f44293cc6c
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
7 изменённых файлов: 109 добавлений и 70 удалений

Просмотреть файл

@ -22,10 +22,12 @@
"handlebars": "^4.0.11",
"immutable": "^4.0.0-rc.9",
"is-url": "^1.2.2",
"js-base64": "^2.4.3",
"lodash": "^4.17.4",
"moment": "^2.19.3",
"node-fetch": "^1.7.1",
"pkg": "^4.3.0",
"pako": "^1.0.6",
"pluralize": "^7.0.0",
"stream-json": "0.5.2",
"string-hash": "^1.1.3",
@ -36,8 +38,10 @@
"@types/graphql": "^0.11.7",
"@types/handlebars": "^4.0.36",
"@types/is-url": "^1.2.28",
"@types/js-base64": "^2.3.1",
"@types/lodash": "^4.14.72",
"@types/node": "^8.0.19",
"@types/pako": "^1.0.0",
"@types/pluralize": "0.0.28",
"@types/shelljs": "^0.7.6",
"@types/string-hash": "^1.1.1",

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -7,14 +7,47 @@ import { defined, panic } from "./Support";
import { TypeGraph } from "./TypeGraph";
import { GraphRewriteBuilder, TypeRef, StringTypeMapping } from "./TypeBuilder";
import { unifyTypes } from "./UnifyClasses";
import { MarkovChain, load, evaluate } from "./MarkovChain";
const mapSizeThreshold = 20;
let markovChain: MarkovChain | undefined = undefined;
/**
 * Scores `name` against the Markov chain by delegating to `evaluate`.
 *
 * The chain is obtained from `load()` the first time this is called and is
 * kept in the module-level `markovChain` variable for subsequent calls.
 */
function nameProbability(name: string): number {
    const chain = markovChain !== undefined ? markovChain : (markovChain = load());
    return evaluate(chain, name);
}
function shouldBeMap(properties: Map<string, ClassProperty>): Set<Type> | undefined {
// Only classes with a certain number of properties are inferred
// as maps.
if (properties.size < mapSizeThreshold) {
return undefined;
const numProperties = properties.size;
if (numProperties < 2) return undefined;
if (numProperties < mapSizeThreshold) {
const names = properties.keySeq();
const probabilities = names.map(nameProbability);
const product = probabilities.reduce((a, b) => a * b, 1);
const probability = Math.pow(product, 1 / numProperties);
// The idea behind this is to have a limit of around 0.004 for the
// smallest classes (the formula yields exactly 0.004 at n=1), rising to
// around 1.0 for n=20. I.e. when we only have a few properties, they
// need to look really weird to infer a map, but when we have more we'll
// accept more ordinary names. The details of the formula are immaterial:
// it is an ad-hoc heuristic that was not derived from anything.
const exponent = 5;
const scale = Math.pow(22, exponent);
const limit = Math.pow(numProperties + 2, exponent) / scale + (0.004 - Math.pow(3, exponent) / scale);
if (probability > limit) return undefined;
/*
console.log(
`limit for ${JSON.stringify(names.toArray())} - ${JSON.stringify(
probabilities.toArray()
)} is ${limit}, we are at ${probability}`
);
*/
}
// FIXME: simplify this - it's no longer necessary with the new

Просмотреть файл

@ -1,10 +1,13 @@
"use strict";
import * as fs from "fs";
import { Base64 } from "js-base64";
import { panic } from "./Support";
import { encodedMarkovChain } from "./EncodedMarkovChain";
import * as pako from "pako";
export type SubTrie = number | undefined | Trie;
// This must be null, not undefined, because we read it from JSON.
export type SubTrie = number | null | Trie;
export type Trie = {
count: number;
arr: SubTrie[];
@ -18,7 +21,7 @@ export type MarkovChain = {
/**
 * Creates an empty trie node: a zero `count` and 128 child slots.
 *
 * Every slot starts out as `null` (never `undefined`), because tries are
 * also read back from JSON, which has no representation for `undefined`.
 * NOTE(review): the 128 slots are presumably indexed by character code -
 * confirm against the lookup/increment code.
 */
function makeTrie(): Trie {
    const arr: SubTrie[] = [];
    for (let i = 0; i < 128; i++) {
        // Exactly one placeholder per slot, so the array ends up 128 long.
        arr.push(null);
    }
    return { count: 0, arr };
}
@ -34,17 +37,17 @@ function lookup(t: Trie, seq: string, i: number): number | undefined {
return panic("Malformed trie");
}
const n = t.arr[first];
if (n === null) {
return undefined;
}
if (typeof n === "object") {
return panic("Malformed trie");
}
if (n === undefined) {
return undefined;
}
return n / t.count;
}
const st = t.arr[first];
if (st === undefined) {
if (st === null) {
return undefined;
}
if (typeof st !== "object") {
@ -64,11 +67,10 @@ function increment(t: Trie, seq: string, i: number): void {
return panic("Malformed trie");
}
let n = t.arr[first];
if (typeof n === "object") {
return panic("Malformed trie");
}
if (n === undefined) {
if (n === null) {
n = 0;
} else if (typeof n === "object") {
return panic("Malformed trie");
}
t.arr[first] = n + 1;
t.count += 1;
@ -76,7 +78,7 @@ function increment(t: Trie, seq: string, i: number): void {
}
let st = t.arr[first];
if (st === undefined) {
if (st === null) {
t.arr[first] = st = makeTrie();
}
if (typeof st !== "object") {
@ -85,9 +87,7 @@ function increment(t: Trie, seq: string, i: number): void {
return increment(st, seq, i + 1);
}
export function train(filename: string, depth: number): MarkovChain {
const contents = fs.readFileSync(filename).toString();
const lines = contents.split("\n");
export function train(lines: string[], depth: number): MarkovChain {
const trie = makeTrie();
for (const l of lines) {
for (let i = depth; i <= l.length; i++) {
@ -95,11 +95,12 @@ export function train(filename: string, depth: number): MarkovChain {
}
}
const mc = { trie, depth };
return { trie, depth };
}
fs.writeFileSync("/tmp/markov.json", JSON.stringify(runLengthEncodeMarkovChain(mc)));
return mc;
/**
 * Reconstructs the Markov chain embedded in `encodedMarkovChain`.
 *
 * The embedded value is Base64-decoded, inflated with pako into a string,
 * and that string is parsed as JSON. The parsed value is returned as-is;
 * no validation of its shape is performed here.
 */
export function load(): MarkovChain {
    const compressed = Base64.atob(encodedMarkovChain);
    const json = pako.inflate(compressed, { to: "string" });
    return JSON.parse(json);
}
export function evaluate(mc: MarkovChain, word: string): number {
@ -122,7 +123,9 @@ function testWord(mc: MarkovChain, word: string): void {
console.log(`"${word}": ${evaluate(mc, word)}`);
}
export function test(mc: MarkovChain): void {
export function test(): void {
const mc = load();
testWord(mc, "url");
testWord(mc, "json");
testWord(mc, "my_property");
@ -132,47 +135,18 @@ export function test(mc: MarkovChain): void {
testWord(mc, "2BTZIqw0ntH9MvilQ3ewNY");
testWord(mc, "0uBTNdNGb2OY5lou41iYL52LcDq2");
testWord(mc, "-KpqHmWuDOUnr1hmAhxp");
testWord(mc, "granularity");
testWord(mc, "coverage");
testWord(mc, "postingFrequency");
testWord(mc, "dataFrequency");
testWord(mc, "units");
testWord(mc, "datasetOwner");
testWord(mc, "organization");
testWord(mc, "timePeriod");
testWord(mc, "contactInformation");
testWord(
mc,
"\ud83d\udebe \ud83c\udd92 \ud83c\udd93 \ud83c\udd95 \ud83c\udd96 \ud83c\udd97 \ud83c\udd99 \ud83c\udfe7"
);
}
/**
 * Run-length encodes `arr`.
 *
 * Consecutive elements that are strictly equal (`===`) are collapsed into a
 * single `[runLength, item]` pair; pairs appear in the order of the runs in
 * the input. An empty input produces an empty result.
 */
function runLengthEncodeArray<T>(arr: T[]): [number, T][] {
    const runs: [number, T][] = [];
    for (const item of arr) {
        const current = runs[runs.length - 1];
        if (current !== undefined && current[1] === item) {
            // Same value as the run in progress: extend it.
            current[0] += 1;
        } else {
            // First element, or the value changed: start a new run.
            runs.push([1, item]);
        }
    }
    return runs;
}
/**
 * Produces a run-length-encoded copy of a trie.
 *
 * The node's `count` is kept as-is. Each child that is itself a trie is
 * encoded recursively; every other child value is passed through
 * unchanged. The resulting child list is then compressed with
 * `runLengthEncodeArray`.
 */
function runLengthEncodeTrie(t: Trie): any {
    const encodedChildren = t.arr.map(x => {
        // `typeof null` is "object" too, and empty slots are stored as
        // null, so null must be excluded before recursing.
        if (x !== null && typeof x === "object") {
            return runLengthEncodeTrie(x);
        }
        return x;
    });
    return {
        count: t.count,
        arr: runLengthEncodeArray(encodedChildren)
    };
}
/**
 * Returns an object with the chain's `depth` and its trie encoded by
 * `runLengthEncodeTrie`.
 */
function runLengthEncodeMarkovChain(mc: MarkovChain): any {
    const { depth, trie } = mc;
    return { depth, trie: runLengthEncodeTrie(trie) };
}

Просмотреть файл

@ -27,7 +27,7 @@ import { Readable } from "stream";
import { panic, assert, defined, withDefault } from "./Support";
import { introspectServer } from "./GraphQLIntrospection";
import { getStream } from "./get-stream/index";
import { train, test } from "./MarkovChain";
import { train } from "./MarkovChain";
const commandLineArgs = require("command-line-args");
const getUsage = require("command-line-usage");
@ -50,7 +50,7 @@ export interface CLIOptions {
graphqlServerHeader?: string[];
template?: string;
out?: string;
markovInputFilename?: string;
buildMarkovChain?: string;
findSimilarClassesSchema?: string;
noMaps: boolean;
@ -242,7 +242,7 @@ function inferOptions(opts: Partial<CLIOptions>): CLIOptions {
quiet: opts.quiet || false,
version: opts.version || false,
out: opts.out,
markovInputFilename: opts.markovInputFilename,
buildMarkovChain: opts.buildMarkovChain,
findSimilarClassesSchema: opts.findSimilarClassesSchema,
graphqlSchema: opts.graphqlSchema,
graphqlIntrospect: opts.graphqlIntrospect,
@ -351,9 +351,10 @@ const optionDefinitions: OptionDefinition[] = [
description: "Make all class properties optional."
},
{
name: "markov-input-filename",
name: "build-markov-chain",
type: String,
description: "Markov corpus filename."
typeLabel: "FILE",
description: "Markov chain corpus filename."
},
{
name: "find-similar-classes-schema",
@ -399,7 +400,7 @@ const sectionsBeforeRenderers: UsageSection[] = [
{
header: "Options",
optionList: optionDefinitions,
hide: ["no-render", "find-similar-classes-schema"]
hide: ["no-render", "build-markov-chain", "find-similar-classes-schema"]
}
];
@ -543,9 +544,11 @@ export async function main(args: string[] | Partial<CLIOptions>) {
console.log("Visit quicktype.io for more info.");
return;
}
if (options.markovInputFilename !== undefined) {
const mc = train(options.markovInputFilename, 3);
test(mc);
if (options.buildMarkovChain !== undefined) {
const contents = fs.readFileSync(options.buildMarkovChain).toString();
const lines = contents.split("\n");
const mc = train(lines, 3);
console.log(JSON.stringify(mc));
return;
}

Просмотреть файл

@ -8,18 +8,22 @@ with open('/usr/share/dict/words') as f:
with open('acronyms.txt') as f:
acronyms = f.read().splitlines()
def all_lower(w):
    """Return the word of a ``[word, is_acronym]`` pair in lower case.

    The second element of the pair is ignored.
    """
    word, _ = w
    return word.lower()
def all_upper(w):
    """Return the word of a ``[word, is_acronym]`` pair in upper case.

    The second element of the pair is ignored.
    """
    word, _ = w
    return word.upper()
def capitalize(w):
    """Return the word of a ``[word, is_acronym]`` pair capitalized.

    The first character is upper-cased and the remainder lower-cased. The
    second element of the pair is ignored. Slicing (rather than indexing)
    keeps this safe for an empty word.
    """
    word, _ = w
    return word[:1].upper() + word[1:].lower()
def cap_and_upper_acro(w):
[word, is_acro] = w
if is_acro:
@ -27,6 +31,7 @@ def cap_and_upper_acro(w):
else:
return capitalize(w)
def choice(items):
total = sum([n for [n, _] in items])
x = random.random()
@ -39,8 +44,12 @@ def choice(items):
formats = [
[3, [all_lower, all_lower, "_"]],
[1, [all_upper, all_upper, "_"]],
[2, [all_lower, all_lower, "-"]],
[1, [all_upper, all_upper, "-"]],
[1, [all_lower, capitalize, "-"]],
[1, [all_lower, all_lower, " "]],
[1, [capitalize, capitalize, " "]],
[5, [all_lower, capitalize, ""]],
[5, [all_lower, cap_and_upper_acro, ""]],
[3, [cap_and_upper_acro, cap_and_upper_acro, ""]]
@ -51,21 +60,27 @@ prefixes = [
[1, "_"]
]
def word():
    """Return a one-element list holding a random ``[word, False]`` pair."""
    return [[random.choice(words), False]]
def word_word():
    """Return two random ``[word, False]`` pairs drawn from ``words``."""
    return [[random.choice(words), False], [random.choice(words), False]]
def word_acronym():
    """Return a random word pair followed by a random acronym pair.

    The acronym's pair carries ``True`` as its second element.
    """
    return [[random.choice(words), False], [random.choice(acronyms), True]]
def acronym_word():
    """Return a random acronym pair followed by a random word pair.

    The acronym's pair carries ``True`` as its second element.
    """
    return [[random.choice(acronyms), True], [random.choice(words), False]]
def word_digit():
    """Return a random word pair followed by a random number pair.

    The number is rendered as a string. Its upper bound is itself drawn
    uniformly from 1..200 before the number is drawn from 1..bound, so
    small numbers come up more often than large ones.
    """
    return [[random.choice(words), False], [str(random.randint(1, random.randint(1, 200))), False]]
def word_acronym_digit():
    """Return a random word pair, a random acronym pair and a digit pair.

    The digit is a single number from 1 to 9 rendered as a string; only the
    acronym's pair carries ``True`` as its second element.
    """
    return [[random.choice(words), False], [random.choice(acronyms), True], [str(random.randint(1, 9)), False]]
@ -78,6 +93,7 @@ generators = [
[2, word_acronym_digit]
]
def make_corpus_entry():
words = choice(generators)()
[first_format, rest_format, separator] = choice(formats)

Просмотреть файл

@ -0,0 +1,8 @@
#!/bin/bash
# Regenerates the embedded Markov chain: builds a corpus, trains the chain
# with quickertype, gzips the resulting JSON and prints a TypeScript module
# exporting it as a Base64 string on stdout.

# Stop at the first failing step so a broken run cannot emit a module built
# from stale or partial intermediate files.
set -euo pipefail

./generate-markov-corpus.py >/tmp/corpus.txt
../script/quickertype --build-markov-chain /tmp/corpus.txt >/tmp/markov.json
# -f: overwrite /tmp/markov.json.gz left behind by a previous run; without
# it gzip refuses to replace the file and the old archive would be encoded.
gzip -9 -f /tmp/markov.json
echo -n 'export const encodedMarkovChain = "'
base64 /tmp/markov.json.gz | tr -d '\n'
echo '";'