Merge pull request #501 from quicktype/markov
Actually use the Markov Chain for map inference, for real this time
This commit is contained in:
Коммит
f44293cc6c
|
@ -22,10 +22,12 @@
|
|||
"handlebars": "^4.0.11",
|
||||
"immutable": "^4.0.0-rc.9",
|
||||
"is-url": "^1.2.2",
|
||||
"js-base64": "^2.4.3",
|
||||
"lodash": "^4.17.4",
|
||||
"moment": "^2.19.3",
|
||||
"node-fetch": "^1.7.1",
|
||||
"pkg": "^4.3.0",
|
||||
"pako": "^1.0.6",
|
||||
"pluralize": "^7.0.0",
|
||||
"stream-json": "0.5.2",
|
||||
"string-hash": "^1.1.3",
|
||||
|
@ -36,8 +38,10 @@
|
|||
"@types/graphql": "^0.11.7",
|
||||
"@types/handlebars": "^4.0.36",
|
||||
"@types/is-url": "^1.2.28",
|
||||
"@types/js-base64": "^2.3.1",
|
||||
"@types/lodash": "^4.14.72",
|
||||
"@types/node": "^8.0.19",
|
||||
"@types/pako": "^1.0.0",
|
||||
"@types/pluralize": "0.0.28",
|
||||
"@types/shelljs": "^0.7.6",
|
||||
"@types/string-hash": "^1.1.1",
|
||||
|
|
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
|
@ -7,14 +7,47 @@ import { defined, panic } from "./Support";
|
|||
import { TypeGraph } from "./TypeGraph";
|
||||
import { GraphRewriteBuilder, TypeRef, StringTypeMapping } from "./TypeBuilder";
|
||||
import { unifyTypes } from "./UnifyClasses";
|
||||
import { MarkovChain, load, evaluate } from "./MarkovChain";
|
||||
|
||||
const mapSizeThreshold = 20;
|
||||
|
||||
let markovChain: MarkovChain | undefined = undefined;
|
||||
|
||||
/**
 * Scores a property name with the Markov chain.
 *
 * The chain is obtained from `load()` the first time this function runs
 * and is cached in the module-level `markovChain` variable, so subsequent
 * calls reuse the same instance.
 *
 * @param name The property name to score.
 * @returns The value `evaluate` yields for `name` against the cached chain.
 */
function nameProbability(name: string): number {
    if (markovChain === undefined) {
        markovChain = load();
    }
    return evaluate(markovChain, name);
}
|
||||
|
||||
function shouldBeMap(properties: Map<string, ClassProperty>): Set<Type> | undefined {
|
||||
// Only classes with a certain number of properties are inferred
|
||||
// as maps.
|
||||
if (properties.size < mapSizeThreshold) {
|
||||
return undefined;
|
||||
const numProperties = properties.size;
|
||||
if (numProperties < 2) return undefined;
|
||||
|
||||
if (numProperties < mapSizeThreshold) {
|
||||
const names = properties.keySeq();
|
||||
const probabilities = names.map(nameProbability);
|
||||
const product = probabilities.reduce((a, b) => a * b, 1);
|
||||
const probability = Math.pow(product, 1 / numProperties);
|
||||
// The idea behind this is to have a probability around 0.004 for
|
||||
// n=1, up to around 1.0 for n=20. I.e. when we only have a few
|
||||
// properties, they need to look really weird to infer a map, but
|
||||
// when we have more we'll accept more ordinary names. The details
|
||||
// of the formula are immaterial because I pulled it out of my ass.
|
||||
const exponent = 5;
|
||||
const scale = Math.pow(22, exponent);
|
||||
const limit = Math.pow(numProperties + 2, exponent) / scale + (0.004 - Math.pow(3, exponent) / scale);
|
||||
if (probability > limit) return undefined;
|
||||
|
||||
/*
|
||||
console.log(
|
||||
`limit for ${JSON.stringify(names.toArray())} - ${JSON.stringify(
|
||||
probabilities.toArray()
|
||||
)} is ${limit}, we are at ${probability}`
|
||||
);
|
||||
*/
|
||||
}
|
||||
|
||||
// FIXME: simplify this - it's no longer necessary with the new
|
||||
|
|
|
@ -1,10 +1,13 @@
|
|||
"use strict";
|
||||
|
||||
import * as fs from "fs";
|
||||
import { Base64 } from "js-base64";
|
||||
|
||||
import { panic } from "./Support";
|
||||
import { encodedMarkovChain } from "./EncodedMarkovChain";
|
||||
import * as pako from "pako";
|
||||
|
||||
export type SubTrie = number | undefined | Trie;
|
||||
// This must be null, not undefined, because we read it from JSON.
|
||||
export type SubTrie = number | null | Trie;
|
||||
export type Trie = {
|
||||
count: number;
|
||||
arr: SubTrie[];
|
||||
|
@ -18,7 +21,7 @@ export type MarkovChain = {
|
|||
/**
 * Creates an empty trie node.
 *
 * The node has a `count` of 0 and 128 child slots. Every slot starts out
 * as `null` (not `undefined`), matching the `SubTrie` type, which uses
 * `null` for "empty" because tries are read back from JSON.
 *
 * @returns A fresh trie node with no children.
 */
function makeTrie(): Trie {
    const arr: SubTrie[] = [];
    for (let i = 0; i < 128; i++) {
        arr.push(null);
    }
    return { count: 0, arr };
}
|
||||
|
@ -34,17 +37,17 @@ function lookup(t: Trie, seq: string, i: number): number | undefined {
|
|||
return panic("Malformed trie");
|
||||
}
|
||||
const n = t.arr[first];
|
||||
if (n === null) {
|
||||
return undefined;
|
||||
}
|
||||
if (typeof n === "object") {
|
||||
return panic("Malformed trie");
|
||||
}
|
||||
if (n === undefined) {
|
||||
return undefined;
|
||||
}
|
||||
return n / t.count;
|
||||
}
|
||||
|
||||
const st = t.arr[first];
|
||||
if (st === undefined) {
|
||||
if (st === null) {
|
||||
return undefined;
|
||||
}
|
||||
if (typeof st !== "object") {
|
||||
|
@ -64,11 +67,10 @@ function increment(t: Trie, seq: string, i: number): void {
|
|||
return panic("Malformed trie");
|
||||
}
|
||||
let n = t.arr[first];
|
||||
if (typeof n === "object") {
|
||||
return panic("Malformed trie");
|
||||
}
|
||||
if (n === undefined) {
|
||||
if (n === null) {
|
||||
n = 0;
|
||||
} else if (typeof n === "object") {
|
||||
return panic("Malformed trie");
|
||||
}
|
||||
t.arr[first] = n + 1;
|
||||
t.count += 1;
|
||||
|
@ -76,7 +78,7 @@ function increment(t: Trie, seq: string, i: number): void {
|
|||
}
|
||||
|
||||
let st = t.arr[first];
|
||||
if (st === undefined) {
|
||||
if (st === null) {
|
||||
t.arr[first] = st = makeTrie();
|
||||
}
|
||||
if (typeof st !== "object") {
|
||||
|
@ -85,9 +87,7 @@ function increment(t: Trie, seq: string, i: number): void {
|
|||
return increment(st, seq, i + 1);
|
||||
}
|
||||
|
||||
export function train(filename: string, depth: number): MarkovChain {
|
||||
const contents = fs.readFileSync(filename).toString();
|
||||
const lines = contents.split("\n");
|
||||
export function train(lines: string[], depth: number): MarkovChain {
|
||||
const trie = makeTrie();
|
||||
for (const l of lines) {
|
||||
for (let i = depth; i <= l.length; i++) {
|
||||
|
@ -95,11 +95,12 @@ export function train(filename: string, depth: number): MarkovChain {
|
|||
}
|
||||
}
|
||||
|
||||
const mc = { trie, depth };
|
||||
return { trie, depth };
|
||||
}
|
||||
|
||||
fs.writeFileSync("/tmp/markov.json", JSON.stringify(runLengthEncodeMarkovChain(mc)));
|
||||
|
||||
return mc;
|
||||
/**
 * Reconstructs the Markov chain that ships embedded in the source.
 *
 * `encodedMarkovChain` is Base64-decoded, the result is inflated with
 * pako into a string, and that string is parsed as JSON.
 *
 * NOTE(review): the parsed JSON is returned as-is without validation, so
 * the embedded data is trusted to already have the `MarkovChain` shape.
 *
 * @returns The decoded Markov chain.
 */
export function load(): MarkovChain {
    const bytes = Base64.atob(encodedMarkovChain);
    return JSON.parse(pako.inflate(bytes, { to: "string" }));
}
|
||||
|
||||
export function evaluate(mc: MarkovChain, word: string): number {
|
||||
|
@ -122,7 +123,9 @@ function testWord(mc: MarkovChain, word: string): void {
|
|||
console.log(`"${word}": ${evaluate(mc, word)}`);
|
||||
}
|
||||
|
||||
export function test(mc: MarkovChain): void {
|
||||
export function test(): void {
|
||||
const mc = load();
|
||||
|
||||
testWord(mc, "url");
|
||||
testWord(mc, "json");
|
||||
testWord(mc, "my_property");
|
||||
|
@ -132,47 +135,18 @@ export function test(mc: MarkovChain): void {
|
|||
testWord(mc, "2BTZIqw0ntH9MvilQ3ewNY");
|
||||
testWord(mc, "0uBTNdNGb2OY5lou41iYL52LcDq2");
|
||||
testWord(mc, "-KpqHmWuDOUnr1hmAhxp");
|
||||
testWord(mc, "granularity");
|
||||
testWord(mc, "coverage");
|
||||
testWord(mc, "postingFrequency");
|
||||
testWord(mc, "dataFrequency");
|
||||
testWord(mc, "units");
|
||||
testWord(mc, "datasetOwner");
|
||||
testWord(mc, "organization");
|
||||
testWord(mc, "timePeriod");
|
||||
testWord(mc, "contactInformation");
|
||||
|
||||
testWord(
|
||||
mc,
|
||||
"\ud83d\udebe \ud83c\udd92 \ud83c\udd93 \ud83c\udd95 \ud83c\udd96 \ud83c\udd97 \ud83c\udd99 \ud83c\udfe7"
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Run-length encodes an array.
 *
 * Consecutive elements that compare equal with `===` are collapsed into a
 * single `[runLength, item]` pair. Order is preserved, and the run lengths
 * add up to `arr.length`.
 *
 * @param arr The array to encode; it is not modified.
 * @returns The list of `[runLength, item]` pairs, empty for an empty input.
 */
function runLengthEncodeArray<T>(arr: T[]): [number, T][] {
    const result: [number, T][] = [];
    if (arr.length === 0) return result;

    let runItem: T = arr[0];
    let runStart = 0;

    for (let i = 1; i < arr.length; i++) {
        const item = arr[i];
        if (item !== runItem) {
            // The current run ended just before index i.
            result.push([i - runStart, runItem]);
            runItem = item;
            runStart = i;
        }
    }
    // Flush the final run, which extends to the end of the array.
    result.push([arr.length - runStart, runItem]);

    return result;
}
|
||||
|
||||
/**
 * Produces a compact, run-length-encoded copy of a trie.
 *
 * The node's `count` is copied, and its `arr` is passed through
 * `runLengthEncodeArray` after every nested trie has been encoded
 * recursively. Numbers and empty slots are passed through unchanged.
 *
 * @param t The trie node to encode; it is not modified.
 * @returns An object with `count` and the run-length-encoded `arr`.
 */
function runLengthEncodeTrie(t: Trie): any {
    return {
        count: t.count,
        arr: runLengthEncodeArray(
            t.arr.map(x => {
                // `typeof null` is "object" too, so empty (null) slots must
                // be excluded explicitly before recursing.
                if (x !== null && typeof x === "object") {
                    return runLengthEncodeTrie(x);
                }
                return x;
            })
        )
    };
}
|
||||
|
||||
/**
 * Produces a run-length-encoded copy of a Markov chain.
 *
 * `depth` is copied unchanged and `trie` is replaced by the result of
 * `runLengthEncodeTrie`.
 *
 * @param mc The chain to encode; it is not modified.
 * @returns An object with `depth` and the encoded `trie`.
 */
function runLengthEncodeMarkovChain(mc: MarkovChain): any {
    return { depth: mc.depth, trie: runLengthEncodeTrie(mc.trie) };
}
|
||||
|
|
21
src/cli.ts
21
src/cli.ts
|
@ -27,7 +27,7 @@ import { Readable } from "stream";
|
|||
import { panic, assert, defined, withDefault } from "./Support";
|
||||
import { introspectServer } from "./GraphQLIntrospection";
|
||||
import { getStream } from "./get-stream/index";
|
||||
import { train, test } from "./MarkovChain";
|
||||
import { train } from "./MarkovChain";
|
||||
|
||||
const commandLineArgs = require("command-line-args");
|
||||
const getUsage = require("command-line-usage");
|
||||
|
@ -50,7 +50,7 @@ export interface CLIOptions {
|
|||
graphqlServerHeader?: string[];
|
||||
template?: string;
|
||||
out?: string;
|
||||
markovInputFilename?: string;
|
||||
buildMarkovChain?: string;
|
||||
findSimilarClassesSchema?: string;
|
||||
|
||||
noMaps: boolean;
|
||||
|
@ -242,7 +242,7 @@ function inferOptions(opts: Partial<CLIOptions>): CLIOptions {
|
|||
quiet: opts.quiet || false,
|
||||
version: opts.version || false,
|
||||
out: opts.out,
|
||||
markovInputFilename: opts.markovInputFilename,
|
||||
buildMarkovChain: opts.buildMarkovChain,
|
||||
findSimilarClassesSchema: opts.findSimilarClassesSchema,
|
||||
graphqlSchema: opts.graphqlSchema,
|
||||
graphqlIntrospect: opts.graphqlIntrospect,
|
||||
|
@ -351,9 +351,10 @@ const optionDefinitions: OptionDefinition[] = [
|
|||
description: "Make all class properties optional."
|
||||
},
|
||||
{
|
||||
name: "markov-input-filename",
|
||||
name: "build-markov-chain",
|
||||
type: String,
|
||||
description: "Markov corpus filename."
|
||||
typeLabel: "FILE",
|
||||
description: "Markov chain corpus filename."
|
||||
},
|
||||
{
|
||||
name: "find-similar-classes-schema",
|
||||
|
@ -399,7 +400,7 @@ const sectionsBeforeRenderers: UsageSection[] = [
|
|||
{
|
||||
header: "Options",
|
||||
optionList: optionDefinitions,
|
||||
hide: ["no-render", "find-similar-classes-schema"]
|
||||
hide: ["no-render", "build-markov-chain", "find-similar-classes-schema"]
|
||||
}
|
||||
];
|
||||
|
||||
|
@ -543,9 +544,11 @@ export async function main(args: string[] | Partial<CLIOptions>) {
|
|||
console.log("Visit quicktype.io for more info.");
|
||||
return;
|
||||
}
|
||||
if (options.markovInputFilename !== undefined) {
|
||||
const mc = train(options.markovInputFilename, 3);
|
||||
test(mc);
|
||||
if (options.buildMarkovChain !== undefined) {
|
||||
const contents = fs.readFileSync(options.buildMarkovChain).toString();
|
||||
const lines = contents.split("\n");
|
||||
const mc = train(lines, 3);
|
||||
console.log(JSON.stringify(mc));
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -8,18 +8,22 @@ with open('/usr/share/dict/words') as f:
|
|||
with open('acronyms.txt') as f:
|
||||
acronyms = f.read().splitlines()
|
||||
|
||||
|
||||
def all_lower(w):
    """Return the word of a ``[word, is_acronym]`` pair in lower case.

    The acronym flag is ignored.
    """
    word, _ = w
    return word.lower()
|
||||
|
||||
|
||||
def all_upper(w):
    """Return the word of a ``[word, is_acronym]`` pair in upper case.

    The acronym flag is ignored.
    """
    word, _ = w
    return word.upper()
|
||||
|
||||
|
||||
def capitalize(w):
    """Return the word of a ``[word, is_acronym]`` pair capitalized.

    The first character is upper-cased and the rest lower-cased; the
    acronym flag is ignored. An empty word stays empty.
    """
    word, _ = w
    return word[:1].upper() + word[1:].lower()
|
||||
|
||||
|
||||
def cap_and_upper_acro(w):
|
||||
[word, is_acro] = w
|
||||
if is_acro:
|
||||
|
@ -27,6 +31,7 @@ def cap_and_upper_acro(w):
|
|||
else:
|
||||
return capitalize(w)
|
||||
|
||||
|
||||
def choice(items):
|
||||
total = sum([n for [n, _] in items])
|
||||
x = random.random()
|
||||
|
@ -39,8 +44,12 @@ def choice(items):
|
|||
|
||||
formats = [
|
||||
[3, [all_lower, all_lower, "_"]],
|
||||
[1, [all_upper, all_upper, "_"]],
|
||||
[2, [all_lower, all_lower, "-"]],
|
||||
[1, [all_upper, all_upper, "-"]],
|
||||
[1, [all_lower, capitalize, "-"]],
|
||||
[1, [all_lower, all_lower, " "]],
|
||||
[1, [capitalize, capitalize, " "]],
|
||||
[5, [all_lower, capitalize, ""]],
|
||||
[5, [all_lower, cap_and_upper_acro, ""]],
|
||||
[3, [cap_and_upper_acro, cap_and_upper_acro, ""]]
|
||||
|
@ -51,21 +60,27 @@ prefixes = [
|
|||
[1, "_"]
|
||||
]
|
||||
|
||||
|
||||
def word():
    """Generate one random dictionary word, flagged as not an acronym.

    Returns a one-element list of ``[word, is_acronym]`` pairs.
    """
    return [[random.choice(words), False]]
|
||||
|
||||
|
||||
def word_word():
    """Generate two independently chosen random dictionary words.

    Returns a list of two ``[word, is_acronym]`` pairs, both flagged as
    not acronyms.
    """
    return [[random.choice(words), False], [random.choice(words), False]]
|
||||
|
||||
|
||||
def word_acronym():
    """Generate a random dictionary word followed by a random acronym.

    Returns a list of two ``[word, is_acronym]`` pairs; only the second
    is flagged as an acronym.
    """
    return [[random.choice(words), False], [random.choice(acronyms), True]]
|
||||
|
||||
|
||||
def acronym_word():
    """Generate a random acronym followed by a random dictionary word.

    Returns a list of two ``[word, is_acronym]`` pairs; only the first
    is flagged as an acronym.
    """
    return [[random.choice(acronyms), True], [random.choice(words), False]]
|
||||
|
||||
|
||||
def word_digit():
    """Generate a random dictionary word followed by a random number.

    The number is drawn from ``1..k`` where ``k`` is itself drawn from
    ``1..200``, which makes small numbers more likely than large ones.
    Returns a list of two ``[word, is_acronym]`` pairs, neither flagged as
    an acronym; the number is stored as a string.
    """
    return [[random.choice(words), False], [str(random.randint(1, random.randint(1, 200))), False]]
|
||||
|
||||
|
||||
def word_acronym_digit():
    """Generate a random word, a random acronym, and a single digit 1-9.

    Returns a list of three ``[word, is_acronym]`` pairs; only the middle
    one is flagged as an acronym, and the digit is stored as a string.
    """
    return [[random.choice(words), False], [random.choice(acronyms), True], [str(random.randint(1, 9)), False]]
|
||||
|
||||
|
@ -78,6 +93,7 @@ generators = [
|
|||
[2, word_acronym_digit]
|
||||
]
|
||||
|
||||
|
||||
def make_corpus_entry():
|
||||
words = choice(generators)()
|
||||
[first_format, rest_format, separator] = choice(formats)
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
#!/bin/bash
#
# Rebuilds the embedded Markov chain and prints, on stdout, a TypeScript
# statement that defines `encodedMarkovChain` as the Base64 encoding of the
# gzipped chain JSON.
#
# Steps: generate a corpus, train the chain from it with quickertype, gzip
# the resulting JSON, then emit it Base64-encoded on a single line.
#
# NOTE(review): the relative paths assume the script is run from its own
# directory, and the output is presumably redirected into the source file
# that exports `encodedMarkovChain` - confirm against the build docs.

# Stop at the first failing step so a broken corpus/chain is never encoded.
set -euo pipefail

./generate-markov-corpus.py >/tmp/corpus.txt
../script/quickertype --build-markov-chain /tmp/corpus.txt >/tmp/markov.json
# -f: replace any /tmp/markov.json.gz left over from a previous run; without
# it gzip keeps the old archive and the stale chain would be encoded below.
gzip -9 -f /tmp/markov.json
echo -n 'export const encodedMarkovChain = "'
base64 /tmp/markov.json.gz | tr -d '\n'
echo '";'
|
Загрузка…
Ссылка в новой задаче