From 210c0883f9dd37a5687ae942f20f7a68fdb10727 Mon Sep 17 00:00:00 2001
From: Kyle Lahnakoski
Date: Wed, 9 May 2018 12:30:44 -0400
Subject: [PATCH] lib updates

---
 vendor/jx_base/README.md                      |  32 +
 vendor/jx_base/__init__.py                    |  26 +-
 vendor/jx_base/container.py                   |  14 +-
 vendor/jx_base/expressions.py                 | 125 ++--
 vendor/jx_base/facts.py                       |  27 +
 vendor/jx_base/namespace.py                   |  69 ++
 vendor/jx_base/queries.py                     |   2 +-
 vendor/jx_base/query.py                       |  31 +-
 vendor/jx_base/schema.py                      | 187 +++--
 vendor/jx_base/snowflake.py                   |  30 +
 vendor/jx_base/table.py                       |  22 +
 vendor/jx_elasticsearch/es09/aggop.py         |   3 +-
 vendor/jx_elasticsearch/es09/expressions.py   |  17 +-
 vendor/jx_elasticsearch/es09/util.py          |  18 +-
 vendor/jx_elasticsearch/es14/__init__.py      |  54 +-
 vendor/jx_elasticsearch/es14/aggs.py          | 106 +--
 vendor/jx_elasticsearch/es14/decoders.py      | 201 +++--
 vendor/jx_elasticsearch/es14/deep.py          |  33 +-
 vendor/jx_elasticsearch/es14/expressions.py   | 647 +++++++++-------
 vendor/jx_elasticsearch/es14/format.py        |  24 +-
 vendor/jx_elasticsearch/es14/setop.py         |  27 +-
 vendor/jx_elasticsearch/es14/util.py          |  34 +-
 vendor/jx_elasticsearch/es52/__init__.py      |  36 +-
 vendor/jx_elasticsearch/es52/aggs.py          |  91 ++-
 vendor/jx_elasticsearch/es52/decoders.py      | 138 ++--
 vendor/jx_elasticsearch/es52/deep.py          |  35 +-
 vendor/jx_elasticsearch/es52/expressions.py   | 558 +++++++-------
 vendor/jx_elasticsearch/es52/format.py        |  18 +-
 vendor/jx_elasticsearch/es52/setop.py         |  31 +-
 vendor/jx_elasticsearch/es52/util.py          |  34 +-
 vendor/jx_elasticsearch/meta.py               | 697 ++++++++++--------
 vendor/jx_python/__init__.py                  |  19 +-
 .../containers/list_usingPythonList.py        |  54 +-
 vendor/jx_python/jx.py                        |  56 +-
 vendor/jx_python/meta.py                      | 305 ++++++--
 vendor/jx_python/namespace/__init__.py        |  59 --
 vendor/jx_python/table.py                     |   4 +-
 vendor/mo_collections/relation.py             |  45 +-
 vendor/mo_dots/__init__.py                    |   8 +-
 vendor/mo_dots/lists.py                       |  28 +-
 vendor/mo_files/__init__.py                   |  12 +-
 vendor/mo_future/__init__.py                  |  11 +-
 vendor/mo_json_config/__init__.py             |   2 +
 vendor/mo_kwargs/__init__.py                  |   3 +-
 vendor/mo_logs/__init__.py                    |  15 +-
 vendor/mo_logs/exceptions.py                  |   7 +
 vendor/mo_logs/log_usingElasticSearch.py      |   7 +-
 vendor/mo_logs/log_usingStream.py             |  13 +-
 vendor/mo_logs/log_usingThreadedStream.py     |   4 +-
 vendor/mo_logs/startup.py                     |  51 +-
 vendor/mo_math/__init__.py                    |   6 +
 vendor/mo_testing/fuzzytestcase.py            |   3 +-
 vendor/mo_threads/__init__.py                 |  21 +-
 vendor/mo_threads/lock.py                     |   4 +-
 vendor/{mo_logs => mo_threads}/profiles.py    |  42 +-
 vendor/mo_threads/queues.py                   |  78 +-
 vendor/mo_threads/threads.py                  | 150 ++--
 vendor/mo_threads/till.py                     |  32 +-
 vendor/mo_times/dates.py                      |  12 +-
 vendor/mo_times/timer.py                      |  12 +-
 vendor/pyLibrary/aws/s3.py                    |   7 +-
 vendor/pyLibrary/convert.py                   |   9 +
 vendor/pyLibrary/env/elasticsearch.py         | 347 +++++----
 vendor/pyLibrary/env/flask_wrappers.py        |   7 +-
 vendor/pyLibrary/env/http.py                  |  22 +-
 vendor/pyLibrary/env/typed_inserter.py        |  13 +-
 vendor/pyLibrary/queries/__init__.py          |   0
 vendor/pyLibrary/queries/jx_usingMySQL.py     | 458 ------------
 vendor/pyLibrary/sql/mysql.py                 |   8 +-
 vendor/pyLibrary/sql/sqlite.py                |  67 +-
 70 files changed, 2864 insertions(+), 2504 deletions(-)
 create mode 100644 vendor/jx_base/README.md
 create mode 100644 vendor/jx_base/facts.py
 create mode 100644 vendor/jx_base/namespace.py
 create mode 100644 vendor/jx_base/snowflake.py
 create mode 100644 vendor/jx_base/table.py
 rename vendor/{mo_logs => mo_threads}/profiles.py (83%)
 delete mode 100644 vendor/pyLibrary/queries/__init__.py
 delete mode 100644 vendor/pyLibrary/queries/jx_usingMySQL.py

diff --git a/vendor/jx_base/README.md b/vendor/jx_base/README.md
new file mode 100644
index 0000000..a665114
--- /dev/null
+++ b/vendor/jx_base/README.md
@@ -0,0 +1,32 @@
+
+## Some help for the programmer
+
+Some nomenclature is required to help follow the logic of these modules.
+
+### Table
+
+Same as in database terminology: a single, unordered set of rows.
+
+### Schema
+
+A set of columns that describe all the (possibly optional) properties available on all rows of a table.
+
+### Facts
+
+Represents the multiple tables in the hierarchical database.
+
+### Snowflake
+
+JSON Query Expressions are used to query hierarchical databases. The relations in a hierarchical database are limited to a tree; the path between any two tables is unique; in a query, no matter which table is the "origin", any column in the hierarchical database can be accessed using a unique combination of joins with the origin.
+
+With this in mind, a Snowflake is a list of all columns, for all the tables, in the hierarchical database.
+
+### Container
+
+A datastore that holds multiple facts.
+
+### Namespace
+
+Metadata for a container: information on multiple snowflakes.
+
+ 
\ No newline at end of file
diff --git a/vendor/jx_base/__init__.py b/vendor/jx_base/__init__.py
index 25e94ef..60af4e9 100644
--- a/vendor/jx_base/__init__.py
+++ b/vendor/jx_base/__init__.py
@@ -14,14 +14,11 @@ from __future__ import unicode_literals
 from collections import Mapping
 from uuid import uuid4
 
-from mo_json import value2json
-
-from mo_logs.strings import expand_template, quote
-
-from mo_logs import Log
-
 from mo_dots import NullType, Data, FlatList, wrap, coalesce, listwrap
 from mo_future import text_type, none_type, PY2
+from mo_json import value2json
+from mo_logs import Log
+from mo_logs.strings import expand_template, quote
 from mo_times import Date
 
 IS_NULL = '0'
@@ -39,7 +36,7 @@ STRUCT = [EXISTS, OBJECT, NESTED]
 
 
 python_type_to_json_type = {
-    int: INTEGER,
+    int: NUMBER,
     text_type: STRING,
     float: NUMBER,
     None: OBJECT,
@@ -223,7 +220,7 @@ class {{class_name}}(Mapping):
     return _exec(code, name)
 
 
-class Table(DataClass(
+class TableDesc(DataClass(
     "Table",
     [
         "name",
@@ -241,6 +238,7 @@ class Table(DataClass(
 #         return singlton.get_columns(table_name=self.name)
 
 
+
 Column = DataClass(
     "Column",
     [
@@ -248,8 +246,8 @@ Column = DataClass(
         "names",  # MAP FROM TABLE NAME TO COLUMN NAME (ONE COLUMN CAN HAVE MULTIPLE NAMES)
         "es_column",
         "es_index",
-        # "es_type",
-        "type",
+        "es_type",
+        {"name": "jx_type", "nulls": True},
         {"name": "useSource", "default": False},
         {"name": "nested_path", "nulls": True},  # AN ARRAY OF PATHS (FROM DEEPEST TO SHALLOWEST) INDICATING THE JSON SUB-ARRAYS
         {"name": "count", "nulls": True},
@@ -262,3 +260,11 @@ Column = DataClass(
         {"eq": [{"last": "nested_path"}, {"literal": "."}]}
     ]}
 )
+
+
+from jx_base.container import Container
+from jx_base.namespace import Namespace
+from jx_base.facts import Facts
+from jx_base.snowflake import Snowflake
+from jx_base.table import Table
+from jx_base.schema import Schema
diff --git a/vendor/jx_base/container.py b/vendor/jx_base/container.py
index cdfa5f7..acf233c 100644
--- a/vendor/jx_base/container.py
+++ b/vendor/jx_base/container.py
@@ -47,7 +47,9 @@ def _delayed_imports():
 
 class Container(object):
     """
-    Containers are data storage capable of handing queries on that storage
+    CONTAINERS HOLD MULTIPLE FACTS AND CAN HANDLE
+    GENERAL JSON QUERY EXPRESSIONS ON THEIR CONTENTS
+    METADATA FOR A Container IS CALLED A Namespace
     """
     __slots__ = ["data", "namespaces"]
 
@@ -95,16 +97,6 @@ class 
Container(object): else: Log.error("Do not know how to handle {{type}}", type=frum.__class__.__name__) - - def __init__(self, frum, schema=None): - object.__init__(self) - if not type2container: - _delayed_imports() - - self.data = frum - if isinstance(schema, list): - Log.error("expecting map from es_column to column object") - def query(self, query): if query.frum != self: Log.error("not expected") diff --git a/vendor/jx_base/expressions.py b/vendor/jx_base/expressions.py index ddc46a1..9edaa6c 100644 --- a/vendor/jx_base/expressions.py +++ b/vendor/jx_base/expressions.py @@ -11,7 +11,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals -import itertools import operator from collections import Mapping from decimal import Decimal @@ -20,7 +19,7 @@ import mo_json from jx_base import OBJECT, python_type_to_json_type, BOOLEAN, NUMBER, INTEGER, STRING, IS_NULL from jx_base.queries import is_variable_name, get_property_name from mo_dots import coalesce, wrap, Null, split_field -from mo_future import text_type, utf8_json_encoder, get_function_name +from mo_future import text_type, utf8_json_encoder, get_function_name, zip_longest from mo_json import scrub from mo_logs import Log, Except from mo_math import Math, MAX, MIN, UNION @@ -63,7 +62,7 @@ def jx_expression(expr, schema=None): if len(leaves) == 0: v.data_type = IS_NULL if len(leaves) == 1: - v.data_type = list(leaves)[0].type + v.data_type = list(leaves)[0].jx_type return output @@ -74,7 +73,9 @@ def _jx_expression(expr): if isinstance(expr, Expression): Log.error("Expecting JSON, not expression") - if expr in (True, False, None) or expr == None or isinstance(expr, (float, int, Decimal, Date)): + if expr is None: + return TRUE + elif expr in (True, False, None) or expr == None or isinstance(expr, (float, int, Decimal, Date)): return Literal(None, expr) elif isinstance(expr, text_type): return Variable(expr) @@ -262,16 +263,17 @@ class Variable(Expression): return {self} def map(self, map_): - if not isinstance(map_, Mapping): - Log.error("Expecting Mapping") - return Variable(coalesce(map_.get(self.var), self.var)) def __hash__(self): return self.var.__hash__() def __eq__(self, other): - return self.var.__eq__(other) + if isinstance(other, Variable): + return self.var == other.var + elif isinstance(other, text_type): + return self.var == other + return False def __unicode__(self): return self.var @@ -419,12 +421,13 @@ class ScriptOp(Expression): ONLY FOR WHEN YOU TRUST THE SCRIPT SOURCE """ - def __init__(self, op, script): + def __init__(self, op, script, data_type=OBJECT): Expression.__init__(self, op, None) if not isinstance(script, text_type): Log.error("expecting text of a script") self.simplified = True self.script = script + self.data_type = data_type @classmethod def define(cls, expr): @@ -498,15 +501,8 @@ class Literal(Expression): elif self.term == None: return False - Log.warning("expensive") - - from mo_testing.fuzzytestcase import assertAlmostEqual - - try: - assertAlmostEqual(self.term, other) - return True - except Exception: - return False + if isinstance(other, Literal): + return (self.term == other.term) or (self.json == other.json) def __data__(self): return {"literal": self.value} @@ -553,6 +549,7 @@ class Literal(Expression): def partial_eval(self): return self ZERO = Literal("literal", 0) +ONE = Literal("literal", 1) class NullOp(Literal): @@ -721,7 +718,10 @@ class DateOp(Literal): def __init__(self, op, term): if hasattr(self, "date"): return - 
self.date = term + if isinstance(term, text_type): + self.date = term + else: + self.date = coalesce(term.literal, term) v = unicode2Date(self.date) if isinstance(v, Date): Literal.__init__(self, op, v.unix) @@ -928,7 +928,11 @@ class FloorOp(Expression): def __init__(self, op, terms, default=NULL): Expression.__init__(self, op, terms) - self.lhs, self.rhs = terms + if len(terms) == 1: + self.lhs = terms[0] + self.rhs = ONE + else: + self.lhs, self.rhs = terms self.default = default def __data__(self): @@ -984,6 +988,11 @@ class EqOp(Expression): else: return {"eq": [self.lhs.__data__(), self.rhs.__data__()]} + def __eq__(self, other): + if isinstance(other, EqOp): + return self.lhs == other.lhs and self.rhs == other.rhs + return False + def vars(self): return self.lhs.vars() | self.rhs.vars() @@ -1135,6 +1144,11 @@ class AndOp(Expression): def __data__(self): return {"and": [t.__data__() for t in self.terms]} + def __eq__(self, other): + if isinstance(other, AndOp): + return all(a == b for a, b in zip_longest(self.terms, other.terms)) + return False + def vars(self): output = set() for t in self.terms: @@ -1149,53 +1163,46 @@ class AndOp(Expression): @simplified def partial_eval(self): - terms = [] - ors = [] - for t in self.terms: + or_terms = [[]] # LIST OF TUPLES FOR or-ing and and-ing + for i, t in enumerate(self.terms): simple = BooleanOp("boolean", t).partial_eval() if simple is TRUE: - pass + continue elif simple is FALSE: return FALSE elif isinstance(simple, AndOp): - terms.extend([tt for tt in simple.terms if tt not in terms]) + for and_terms in or_terms: + and_terms.extend([tt for tt in simple.terms if tt not in and_terms]) + continue elif isinstance(simple, OrOp): - ors.append(simple.terms) + or_terms = [ + and_terms + [o] + for o in simple.terms + for and_terms in or_terms + ] + continue elif simple.type != BOOLEAN: Log.error("expecting boolean value") - elif NotOp("not", simple).partial_eval() in terms: - return FALSE - elif simple not in terms: - terms.append(simple) - if len(ors) == 0: - if len(terms) == 0: + + for and_terms in list(or_terms): + if NotOp("not", simple).partial_eval() in and_terms: + or_terms.remove(and_terms) + elif simple not in and_terms: + and_terms.append(simple) + + if len(or_terms) == 1: + and_terms = or_terms[0] + if len(and_terms) == 0: return TRUE - if len(terms) == 1: - return terms[0] - output = AndOp("and", terms) - return output - elif len(ors) == 1: # SOME SIMPLE COMMON FACTORING - if len(terms) == 0: - return OrOp("or", ors[0]) - elif len(terms) == 1 and terms[0] in ors[0]: - return terms[0] + elif len(and_terms) == 1: + return and_terms[0] else: - agg_terms = [] - for combo in ors[0]: - agg_terms.append( - AndOp("and", [combo]+terms).partial_eval() - ) - return OrOp("or", agg_terms).partial_eval() - elif len(terms) == 0: - return OrOp("or", ors[0]) - - agg_terms = [] - for combo in itertools.product(*ors): - agg_terms.append( - AndOp("and", list(combo)+terms).partial_eval() - ) - return OrOp("or", agg_terms) + return AndOp("and", and_terms) + return OrOp("or", [ + AndOp("and", and_terms) if len(and_terms) > 1 else and_terms[0] + for and_terms in or_terms + ]) class OrOp(Expression): data_type = BOOLEAN @@ -2390,9 +2397,9 @@ class SplitOp(Expression): ) def missing(self): - v = self.value.to_ruby(not_null=True) - find = self.find.to_ruby(not_null=True) - index = v + ".indexOf(" + find + ", " + self.start.to_ruby() + ")" + v = self.value.to_es_script(not_null=True) + find = self.find.to_es_script(not_null=True) + index = v + ".indexOf(" + find 
+ ", " + self.start.to_es_script() + ")" return AndOp("and", [ self.default.missing(), diff --git a/vendor/jx_base/facts.py b/vendor/jx_base/facts.py new file mode 100644 index 0000000..f500e47 --- /dev/null +++ b/vendor/jx_base/facts.py @@ -0,0 +1,27 @@ +# encoding: utf-8 +# +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http:# mozilla.org/MPL/2.0/. +# +# Author: Kyle Lahnakoski (kyle@lahnakoski.com) +# +from __future__ import absolute_import +from __future__ import division +from __future__ import unicode_literals + + +class Facts(object): + """ + REPRESENT A HIERARCHICAL DATASTORE: MULTIPLE TABLES IN A DATABASE ALONG + WITH THE RELATIONS THAT CONNECT THEM ALL, BUT LIMITED TO A TREE + """ + + def __init__(self, container, snowflake): + self.container = container + self.snowflake = snowflake + + @property + def namespace(self): + return self.container.namespace diff --git a/vendor/jx_base/namespace.py b/vendor/jx_base/namespace.py new file mode 100644 index 0000000..628ab50 --- /dev/null +++ b/vendor/jx_base/namespace.py @@ -0,0 +1,69 @@ +# encoding: utf-8 +# +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. +# +# Author: Kyle Lahnakoski (kyle@lahnakoski.com) +# +from __future__ import absolute_import +from __future__ import division +from __future__ import unicode_literals + +from collections import Mapping + +from jx_base.query import QueryOp + + +class Namespace(object): + """ + A CONGLOMERATION OF Snowflake METADATA + A Namespace HOLDS METADATA FOR A Collection + """ + + def get_snowflake(self, fact_table): + raise NotImplementedError() + + def get_schema(self, name): + raise NotImplementedError() + + + def convert(self, expr): + raise NotImplementedError() + + def _convert_query(self, query): + output = QueryOp("from", None) + output.select = self._convert_clause(query.select) + output.where = self.convert(query.where) + output["from"] = self._convert_from(query["from"]) + output.edges = self._convert_clause(query.edges) + output.having = convert_list(self._convert_having, query.having) + output.window = convert_list(self._convert_window, query.window) + output.sort = self._convert_clause(query.sort) + output.format = query.format + + return output + + def _convert_from(self, frum): + raise NotImplementedError() + + def _convert_clause(self, clause): + raise NotImplementedError() + + def _convert_having(self, clause): + raise NotImplementedError() + + def _convert_window(self, clause): + raise NotImplementedError() + + +def convert_list(operator, operand): + if operand==None: + return None + elif isinstance(operand, Mapping): + return operator(operand) + else: + return map(operator, operand) + + diff --git a/vendor/jx_base/queries.py b/vendor/jx_base/queries.py index 8484429..fbc018c 100644 --- a/vendor/jx_base/queries.py +++ b/vendor/jx_base/queries.py @@ -15,7 +15,7 @@ from mo_future import text_type from mo_logs import Log -keyword_pattern = re.compile(r"(\w|[\\.,$])+(?:\.(\w|[\\.,$])+)*") +keyword_pattern = re.compile(r"(\w|[\\.,$-])+(?:\.(\w|[\\.,$-])+)*") def is_variable_name(value): diff --git a/vendor/jx_base/query.py b/vendor/jx_base/query.py index f75cffe..e66a390 100644 --- a/vendor/jx_base/query.py +++ b/vendor/jx_base/query.py @@ -14,23 +14,20 @@ from __future__ import unicode_literals from 
collections import Mapping from copy import copy -from mo_future import text_type - +import jx_base from jx_base import STRUCT -from jx_base.container import Container from jx_base.dimensions import Dimension from jx_base.domains import Domain, SetDomain, DefaultDomain from jx_base.expressions import jx_expression, Expression, Variable, LeavesOp, ScriptOp, OffsetOp, TRUE, FALSE from jx_base.queries import is_variable_name -from jx_base.schema import Schema from mo_dots import Data, relative_field, concat_field from mo_dots import coalesce, Null, set_default, unwraplist, literal_field from mo_dots import wrap, unwrap, listwrap from mo_dots.lists import FlatList +from mo_future import text_type from mo_json.typed_encoder import untype_path from mo_logs import Log -from mo_math import AND, UNION -from mo_math import Math +from mo_math import AND, UNION, Math DEFAULT_LIMIT = 10 MAX_LIMIT = 10000 @@ -62,7 +59,7 @@ class QueryOp(Expression): # return output def __init__(self, op, frum, select=None, edges=None, groupby=None, window=None, where=None, sort=None, limit=None, format=None): - if isinstance(frum, Container): + if isinstance(frum, jx_base.Table): pass else: Expression.__init__(self, op, frum) @@ -206,7 +203,7 @@ class QueryOp(Expression): return FALSE @staticmethod - def wrap(query, table, schema): + def wrap(query, container, namespace): """ NORMALIZE QUERY SO IT CAN STILL BE JSON """ @@ -214,10 +211,14 @@ class QueryOp(Expression): return query query = wrap(query) - - output = QueryOp("from", table) - output.format = query.format - output.limit = Math.min(MAX_LIMIT, coalesce(query.limit, DEFAULT_LIMIT)) + table = container.get_table(query['from']) + schema = table.schema + output = QueryOp( + op="from", + frum=table, + format=query.format, + limit=Math.min(MAX_LIMIT, coalesce(query.limit, DEFAULT_LIMIT)) + ) if query.select or isinstance(query.select, (Mapping, list)): output.select = _normalize_selects(query.select, query.frum, schema=schema) @@ -361,7 +362,7 @@ def _normalize_select(select, frum, schema=None): canonical ) for c in frum.get_columns() - if c.type not in STRUCT + if c.jx_type not in STRUCT ]) else: Log.error("do not know what to do") @@ -773,9 +774,11 @@ def _normalize_sort(sort=None): output.append({"value": s, "sort": 1}) elif Math.is_integer(s): output.append({"value": OffsetOp("offset", s), "sort": 1}) - elif all(d in sort_direction for d in s.values()) and not s.sort and not s.value: + elif not s.sort and not s.value and all(d in sort_direction for d in s.values()): for v, d in s.items(): output.append({"value": jx_expression(v), "sort": sort_direction[d]}) + elif not s.sort and not s.value: + Log.error("`sort` clause must have a `value` property") else: output.append({"value": jx_expression(coalesce(s.value, s.field)), "sort": coalesce(sort_direction[s.sort], 1)}) return output diff --git a/vendor/jx_base/schema.py b/vendor/jx_base/schema.py index 83a381d..429f6b6 100644 --- a/vendor/jx_base/schema.py +++ b/vendor/jx_base/schema.py @@ -13,12 +13,99 @@ from __future__ import unicode_literals from copy import copy -from jx_base import STRUCT, NESTED, PRIMITIVE, OBJECT, EXISTS -from mo_dots import join_field, split_field, Null, startswith_field, set_default, wrap -from mo_json.typed_encoder import unnest_path, untype_path, NESTED_TYPE +from jx_base import STRUCT, NESTED, OBJECT, EXISTS +from mo_dots import Null, startswith_field, set_default, wrap +from mo_json.typed_encoder import unnest_path, untype_path from mo_logs import Log +class Schema(object): + """ + A 
Schema MAPS COLUMN NAMES OF A SINGLE TABLE TO COLUMN INSTANCES THAT MATCH + """ + + def __init__(self, table_name, columns): + """ + :param table_name: A FULL NAME FOR THIS TABLE (NOT USED) + :param columns: ALL COLUMNS IN SNOWFLAKE + """ + self._columns = copy(columns) + self.table = table_name + self.query_path = "." + self.lookup, self.lookup_leaves, self.lookup_variables = _indexer(columns, self.query_path) + + def __getitem__(self, column_name): + cs = self.lookup.get(column_name) + if cs: + return list(cs) + else: + return [wrap({"es_column": column_name})] + + def items(self): + return self.lookup.items() + + def get_column(self, name, table=None): + return self.lookup[name] + + @property + def columns(self): + return self._columns + + def get_column_name(self, column): + """ + RETURN THE COLUMN NAME, FROM THE PERSPECTIVE OF THIS SCHEMA + :param column: + :return: NAME OF column + """ + return column.names[self.query_path] + + def values(self, name): + """ + RETURN VALUES FOR THE GIVEN PATH NAME + :param name: + :return: + """ + return list(self.lookup_variables.get(unnest_path(name), Null)) + + def leaves(self, name): + """ + RETURN LEAVES OF GIVEN PATH NAME + pull leaves, considering query_path and namespace + pull all first-level properties + pull leaves, including parent leaves + pull the head of any tree by name + :param name: + :return: + """ + + return list(self.lookup_leaves.get(unnest_path(name), Null)) + + def map_to_es(self): + """ + RETURN A MAP FROM THE NAMESPACE TO THE es_column NAME + """ + full_name = self.query_path + return set_default( + { + c.names[full_name]: c.es_column + for k, cs in self.lookup.items() + # if startswith_field(k, full_name) + for c in cs if c.jx_type not in STRUCT + }, + { + c.names["."]: c.es_column + for k, cs in self.lookup.items() + # if startswith_field(k, full_name) + for c in cs if c.jx_type not in STRUCT + } + ) + + @property + def columns(self): + return copy(self._columns) + + + def _indexer(columns, query_path): all_names = set(unnest_path(n) for c in columns for n in c.names.values()) | {"."} @@ -29,7 +116,7 @@ def _indexer(columns, query_path): nfp = unnest_path(cname) if ( startswith_field(nfp, full_name) and - c.type not in [EXISTS, OBJECT, NESTED] and + c.es_type not in [EXISTS, OBJECT, NESTED] and (c.es_column != "_id" or full_name == "_id") ): cs = lookup_leaves.setdefault(full_name, set()) @@ -44,7 +131,7 @@ def _indexer(columns, query_path): nfp = unnest_path(cname) if ( startswith_field(nfp, full_name) and - c.type not in [EXISTS, OBJECT] and + c.es_type not in [EXISTS, OBJECT] and (c.es_column != "_id" or full_name == "_id") and startswith_field(c.nested_path[0], query_path) ): @@ -81,93 +168,3 @@ def _indexer(columns, query_path): return relative_lookup, lookup_leaves, lookup_variables - -class Schema(object): - """ - A Schema MAPS ALL COLUMNS IN SNOWFLAKE FROM NAME TO COLUMN INSTANCE - """ - - def __init__(self, table_name, columns): - """ - :param table_name: THE FACT TABLE - :param query_path: PATH TO ARM OF SNOWFLAKE - :param columns: ALL COLUMNS IN SNOWFLAKE - """ - self._columns = copy(columns) - table_path = split_field(table_name) - self.table = join_field(table_path[:1]) # USED AS AN EXPLICIT STATEMENT OF PERSPECTIVE IN THE DATABASE - query_path = join_field(table_path[1:]) # TODO: REPLACE WITH THE nested_path ARRAY - if query_path == ".": - self.query_path = query_path - else: - query_path += "."+NESTED_TYPE - self.query_path = [c for c in columns if c.type == NESTED and c.names["."] == query_path][0].es_column 
-        self.lookup, self.lookup_leaves, self.lookup_variables = _indexer(columns, self.query_path)
-
-    def __getitem__(self, column_name):
-        cs = self.lookup.get(column_name)
-        if cs:
-            return list(cs)
-        else:
-            return [wrap({"es_column": column_name})]
-
-    def items(self):
-        return self.lookup.items()
-
-    def get_column(self, name, table=None):
-        return self.lookup[name]
-
-    def get_column_name(self, column):
-        """
-        RETURN THE COLUMN NAME, FROM THE PERSPECTIVE OF THIS SCHEMA
-        :param column:
-        :return: NAME OF column
-        """
-        return column.names[self.query_path]
-
-    def values(self, name):
-        """
-        RETURN VALUES FOR THE GIVEN PATH NAME
-        :param name:
-        :return:
-        """
-        return list(self.lookup_variables.get(unnest_path(name), Null))
-
-    def leaves(self, name, meta=False):
-        """
-        RETURN LEAVES OF GIVEN PATH NAME
-        pull leaves, considering query_path and namespace
-        pull all first-level properties
-        pull leaves, including parent leaves
-        pull the head of any tree by name
-        :param name:
-        :return:
-        """
-
-        return list(self.lookup_leaves.get(unnest_path(name), Null))
-
-    def map_to_es(self):
-        """
-        RETURN A MAP FROM THE NAME SPACE TO THE es_column NAME
-        """
-        full_name = self.query_path
-        return set_default(
-            {
-                c.names[full_name]: c.es_column
-                for k, cs in self.lookup.items()
-                # if startswith_field(k, full_name)
-                for c in cs if c.type not in STRUCT
-            },
-            {
-                c.names["."]: c.es_column
-                for k, cs in self.lookup.items()
-                # if startswith_field(k, full_name)
-                for c in cs if c.type not in STRUCT
-            }
-        )
-
-    @property
-    def columns(self):
-        return copy(self._columns)
-
-
diff --git a/vendor/jx_base/snowflake.py b/vendor/jx_base/snowflake.py
new file mode 100644
index 0000000..4b19a79
--- /dev/null
+++ b/vendor/jx_base/snowflake.py
@@ -0,0 +1,30 @@
+# encoding: utf-8
+#
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
+#
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import unicode_literals
+
+
+class Snowflake(object):
+    """
+    REPRESENT ONE ALIAS, AND ITS NESTED ARRAYS
+    """
+
+    def get_schema(self, query_path):
+        raise NotImplementedError()
+
+    @property
+    def query_paths(self):
+        raise NotImplementedError()
+
+    @property
+    def columns(self):
+        raise NotImplementedError()
+
diff --git a/vendor/jx_base/table.py b/vendor/jx_base/table.py
new file mode 100644
index 0000000..2605170
--- /dev/null
+++ b/vendor/jx_base/table.py
@@ -0,0 +1,22 @@
+# encoding: utf-8
+#
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+# +# Author: Kyle Lahnakoski (kyle@lahnakoski.com) +# +from __future__ import absolute_import +from __future__ import division +from __future__ import unicode_literals + + +class Table(object): + + def __init__(self, full_name): + self.name = full_name + + def map(self, mapping): + return self + diff --git a/vendor/jx_elasticsearch/es09/aggop.py b/vendor/jx_elasticsearch/es09/aggop.py index e42009b..b5abdb3 100644 --- a/vendor/jx_elasticsearch/es09/aggop.py +++ b/vendor/jx_elasticsearch/es09/aggop.py @@ -11,11 +11,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals +from jx_base.expressions import Variable from jx_base.queries import is_variable_name from jx_elasticsearch import es09 from jx_elasticsearch.es09.util import aggregates, fix_es_stats, build_es_query from jx_elasticsearch import post as es_post -from jx_elasticsearch.es52.expressions import Variable +# from jx_elasticsearch.es52.expressions import Variable from jx_python.containers.cube import Cube from jx_python.expressions import jx_expression_to_function from mo_collections.matrix import Matrix diff --git a/vendor/jx_elasticsearch/es09/expressions.py b/vendor/jx_elasticsearch/es09/expressions.py index f88e409..e7378ea 100644 --- a/vendor/jx_elasticsearch/es09/expressions.py +++ b/vendor/jx_elasticsearch/es09/expressions.py @@ -15,6 +15,8 @@ from collections import Mapping from datetime import datetime import re +from jx_base.queries import keyword_pattern + from mo_future import text_type from pyLibrary import convert from mo_collections import reverse @@ -129,13 +131,13 @@ class _MVEL(object): list = [] for s in selectList: if is_deep: - if s.value and isKeyword(s.value): + if s.value and is_variable_name(s.value): shortForm = self._translate(s.value) list.append("Value2Pipe(" + shortForm + ")\n") else: Log.error("do not know how to handle yet") else: - if s.value and isKeyword(s.value): + if s.value and is_variable_name(s.value): list.append("Value2Pipe(getDocValue(" + value2MVEL(s.value) + "))\n") elif s.value: shortForm = self._translate(s.value) @@ -490,19 +492,8 @@ def _where(esFilter, _translate): VAR_CHAR = "abcdefghijklmnopqurstvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_.\"" -keyword_pattern = re.compile(r"\.*\w*(?:\.\w+)*") -def isKeyword(value): - """ - RETURN TRUE IF THE value IS JUST A NAME OF A FIELD, A LIST OF FIELDS, (OR A VALUE) - """ - if not value or not isinstance(value, text_type): - Log.error("Expecting a string") - - if keyword_pattern.match(value): - return True - return False def value2MVEL(value): diff --git a/vendor/jx_elasticsearch/es09/util.py b/vendor/jx_elasticsearch/es09/util.py index d301460..6d7c4c7 100644 --- a/vendor/jx_elasticsearch/es09/util.py +++ b/vendor/jx_elasticsearch/es09/util.py @@ -13,6 +13,10 @@ from __future__ import unicode_literals from datetime import datetime +from jx_base.queries import is_variable_name + +from mo_logs.strings import quote + from mo_logs import Log, strings from mo_dots import Data from mo_dots import coalesce @@ -23,7 +27,7 @@ from mo_math import COUNT from mo_math import Math from mo_math import stats from jx_base import domains -from jx_elasticsearch.es09.expressions import value2MVEL, isKeyword +from jx_elasticsearch.es09.expressions import value2MVEL from mo_times import durations @@ -68,7 +72,7 @@ def compileTime2Term(edge): # IS THERE A LIMIT ON THE DOMAIN? 
numPartitions = len(edge.domain.partitions) value = edge.value - if isKeyword(value): + if is_variable_name(value): value = "doc[\"" + value + "\"].value" nullTest = compileNullTest(edge) @@ -109,7 +113,7 @@ def compileDuration2Term(edge): # IS THERE A LIMIT ON THE DOMAIN? numPartitions = len(edge.domain.partitions) value = edge.value - if isKeyword(value): + if is_variable_name(value): value = "doc[\"" + value + "\"].value" ref = coalesce(edge.domain.min, edge.domain.max, durations.ZERO) @@ -141,7 +145,7 @@ def compileNumeric2Term(edge): numPartitions = len(edge.domain.partitions) value = edge.value - if isKeyword(value): + if is_variable_name(value): value = "doc[\"" + value + "\"].value" if not edge.domain.max: @@ -179,7 +183,7 @@ def compileString2Term(edge): Log.error("edge script not supported yet") value = edge.value - if isKeyword(value): + if is_variable_name(value): value = strings.expand_template("getDocValue({{path}})", {"path": quote(value)}) else: Log.error("not handled") @@ -202,7 +206,7 @@ def compileNullTest(edge): # IS THERE A LIMIT ON THE DOMAIN? value = edge.value - if isKeyword(value): + if is_variable_name(value): value = "doc[\"" + value + "\"].value" if not edge.domain.max: @@ -240,7 +244,7 @@ def compileEdges2Term(mvel_compiler, edges, constants): def temp(term): return FlatList([edge0.domain.getPartByKey(term)]) - if edge0.value and isKeyword(edge0.value): + if edge0.value and is_variable_name(edge0.value): return Data( field=edge0.value, term2parts=temp diff --git a/vendor/jx_elasticsearch/es14/__init__.py b/vendor/jx_elasticsearch/es14/__init__.py index 523c291..bd824ab 100644 --- a/vendor/jx_elasticsearch/es14/__init__.py +++ b/vendor/jx_elasticsearch/es14/__init__.py @@ -19,22 +19,17 @@ from jx_base.dimensions import Dimension from jx_base.expressions import jx_expression from jx_base.queries import is_variable_name from jx_base.query import QueryOp -from jx_base.schema import Schema from jx_elasticsearch.es14.aggs import es_aggsop, is_aggsop from jx_elasticsearch.es14.deep import is_deepop, es_deepop from jx_elasticsearch.es14.setop import is_setop, es_setop from jx_elasticsearch.es14.util import aggregates -from jx_elasticsearch.meta import FromESMetadata +from jx_elasticsearch.meta import ElasticsearchMetadata, Table from jx_python import jx -from mo_dots import Data, Null, unwrap -from mo_dots import coalesce, split_field, literal_field, unwraplist, join_field -from mo_dots import wrap, listwrap -from mo_dots.lists import FlatList -from mo_json import scrub +from mo_dots import Data, Null, unwrap, coalesce, split_field, literal_field, unwraplist, join_field, wrap, listwrap, FlatList +from mo_json import scrub, value2json +from mo_json.typed_encoder import TYPE_PREFIX from mo_kwargs import override -from mo_logs import Log -from mo_logs.exceptions import Except -from pyLibrary import convert +from mo_logs import Log, Except from pyLibrary.env import elasticsearch, http @@ -45,7 +40,7 @@ class ES14(Container): def __new__(cls, *args, **kwargs): if (len(args) == 1 and args[0].get("index") == "meta") or kwargs.get("index") == "meta": - output = FromESMetadata.__new__(FromESMetadata, *args, **kwargs) + output = ElasticsearchMetadata.__new__(ElasticsearchMetadata, *args, **kwargs) output.__init__(*args, **kwargs) return output else: @@ -66,36 +61,46 @@ class ES14(Container): typed=None, kwargs=None ): - Container.__init__(self, None) + Container.__init__(self) if not container.config.default: container.config.default = { "type": "elasticsearch", "settings": 
unwrap(kwargs) } self.settings = kwargs - self.name = coalesce(name, alias, index) + self.name = name = coalesce(name, alias, index) if read_only: self.es = elasticsearch.Alias(alias=coalesce(alias, index), kwargs=kwargs) else: self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs) - self.meta = FromESMetadata(kwargs=kwargs) + self._namespace = ElasticsearchMetadata(kwargs=kwargs) self.settings.type = self.es.settings.type self.edges = Data() self.worker = None - columns = self.meta.get_columns(table_name=coalesce(name, alias, index)) - self._schema = Schema(coalesce(name, alias, index), columns) + columns = self._namespace.get_snowflake(self._es.settings.alias).columns # ABSOLUTE COLUMNS if typed == None: # SWITCH ON TYPED MODE - self.typed = any(c.es_column.find(".$") != -1 for c in columns) + self.typed = any(c.es_column.find("."+TYPE_PREFIX) != -1 for c in columns) else: self.typed = typed @property - def schema(self): - return self._schema + def snowflake(self): + return self._namespace.get_snowflake(self._es.settings.alias) + + @property + def namespace(self): + return self._namespace + + + def get_table(self, full_name): + return Table(full_name, self) + + def get_schema(self, query_path): + return self._namespace.get_schema(query_path) def __data__(self): settings = self.settings.copy() @@ -126,13 +131,10 @@ class ES14(Container): def query(self, _query): try: - query = QueryOp.wrap(_query, _query.frum, schema=self) - - for n in self.namespaces: - query = n.convert(query) + query = QueryOp.wrap(_query, container=self, namespace=self.namespace) for s in listwrap(query.select): - if not aggregates.get(s.aggregate): + if s.aggregate != None and not aggregates.get(s.aggregate): Log.error( "ES can not aggregate {{name}} because {{aggregate|quote}} is not a recognized aggregate", name=s.name, @@ -213,7 +215,7 @@ class ES14(Container): scripts.append({"doc": v.doc}) else: v = scrub(v) - scripts.append({"script": "ctx._source." + k + " = " + jx_expression(v).to_ruby(schema).script(schema)}) + scripts.append({"script": "ctx._source." 
+ k + " = " + jx_expression(v).to_es_script(schema).script(schema)}) if results.hits.hits: updates = [] @@ -221,7 +223,7 @@ class ES14(Container): for s in scripts: updates.append({"update": {"_id": h._id, "_routing": unwraplist(h.fields[literal_field(schema._routing.path)])}}) updates.append(s) - content = ("\n".join(convert.value2json(c) for c in updates) + "\n").encode('utf-8') + content = ("\n".join(value2json(c) for c in updates) + "\n") response = self.es.cluster.post( self.es.path + "/_bulk", data=content, diff --git a/vendor/jx_elasticsearch/es14/aggs.py b/vendor/jx_elasticsearch/es14/aggs.py index 7752cd1..89cfc0e 100644 --- a/vendor/jx_elasticsearch/es14/aggs.py +++ b/vendor/jx_elasticsearch/es14/aggs.py @@ -11,29 +11,26 @@ from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals -from mo_future import text_type - +from jx_base import EXISTS from jx_base.domains import SetDomain from jx_base.expressions import TupleOp, NULL from jx_base.query import DEFAULT_LIMIT, MAX_LIMIT from jx_elasticsearch import post as es_post -from jx_elasticsearch.es14.decoders import DefaultDecoder, AggsDecoder, ObjectDecoder -from jx_elasticsearch.es14.decoders import DimFieldListDecoder +from jx_elasticsearch.es14.decoders import DefaultDecoder, AggsDecoder, ObjectDecoder, DimFieldListDecoder from jx_elasticsearch.es14.expressions import split_expression_by_depth, AndOp, Variable, NullOp from jx_elasticsearch.es14.setop import get_pull_stats from jx_elasticsearch.es14.util import aggregates from jx_python import jx from jx_python.expressions import jx_expression_to_function from mo_dots import listwrap, Data, wrap, literal_field, set_default, coalesce, Null, split_field, FlatList, unwrap, unwraplist +from mo_future import text_type from mo_json.typed_encoder import encode_property from mo_logs import Log -from mo_logs.strings import quote from mo_math import Math, MAX, UNION from mo_times.timer import Timer def is_aggsop(es, query): - es.cluster.get_metadata() if query.edges or query.groupby or any(a != None and a != "none" for a in listwrap(query.select).aggregate): return True return False @@ -60,12 +57,12 @@ def get_decoders_by_depth(query): edge = edge.copy() vars_ = edge.value.vars() for v in vars_: - if not schema.leaves(v, meta=True): + if not schema.leaves(v.var): Log.error("{{var}} does not exist in schema", var=v) elif edge.range: vars_ = edge.range.min.vars() | edge.range.max.vars() for v in vars_: - if not schema[v]: + if not schema[v.var]: Log.error("{{var}} does not exist in schema", var=v) elif edge.domain.dimension: vars_ = edge.domain.dimension.fields @@ -78,7 +75,7 @@ def get_decoders_by_depth(query): try: vars_ |= edge.value.vars() - depths = set(len(c.nested_path) - 1 for v in vars_ for c in schema.leaves(v)) + depths = set(len(c.nested_path) - 1 for v in vars_ for c in schema.leaves(v.var)) if -1 in depths: Log.error( "Do not know of column {{column}}", @@ -102,10 +99,8 @@ def sort_edges(query, prop): ordered_edges = [] remaining_edges = getattr(query, prop) for s in query.sort: - if not isinstance(s.value, Variable): - Log.error("can only sort by terms") for e in remaining_edges: - if e.value.var == s.value.var: + if e.value == s.value: if isinstance(e.domain, SetDomain): pass # ALREADY SORTED? 
else: @@ -113,6 +108,9 @@ def sort_edges(query, prop): ordered_edges.append(e) remaining_edges.remove(e) break + else: + Log.error("Can not sort by {{expr}}, can only sort by an existing edge expression", expr=s.value) + ordered_edges.extend(remaining_edges) return ordered_edges @@ -136,34 +134,38 @@ def es_aggsop(es, frum, query): new_select["count_"+literal_field(s.value.var)] += [s] else: new_select[literal_field(s.value.var)] += [s] - else: + elif s.aggregate: formula.append(s) for canonical_name, many in new_select.items(): for s in many: - es_cols = frum.schema.values(s.value.var) + columns = frum.schema.values(s.value.var) if s.aggregate == "count": canonical_names = [] - for es_col in es_cols: - cn = literal_field(es_col.es_column + "_count") - canonical_names.append(cn) - es_query.aggs[cn].value_count.field = es_col.es_column - if len(es_cols) == 1: - s.pull = jx_expression_to_function(canonical_names[0] + ".value") + for column in columns: + cn = literal_field(column.es_column + "_count") + if column.jx_type == EXISTS: + canonical_names.append(cn + ".doc_count") + es_query.aggs[cn].filter.range = {column.es_column: {"gt": 0}} + else: + canonical_names.append(cn+ ".value") + es_query.aggs[cn].value_count.field = column.es_column + if len(canonical_names) == 1: + s.pull = jx_expression_to_function(canonical_names[0]) else: - s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names]}) + s.pull = jx_expression_to_function({"add": canonical_names}) elif s.aggregate == "median": - if len(es_cols) > 1: + if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") - es_query.aggs[key].percentiles.field = es_cols[0].es_column + es_query.aggs[key].percentiles.field = columns[0].es_column es_query.aggs[key].percentiles.percents += [50] s.pull = jx_expression_to_function(key + ".values.50\\.0") elif s.aggregate == "percentile": - if len(es_cols) > 1: + if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") @@ -171,49 +173,49 @@ def es_aggsop(es, frum, query): Log.error("Expecting percentile to be a float from 0.0 to 1.0") percent = Math.round(s.percentile * 100, decimal=6) - es_query.aggs[key].percentiles.field = es_cols[0].es_column + es_query.aggs[key].percentiles.field = columns[0].es_column es_query.aggs[key].percentiles.percents += [percent] s.pull = jx_expression_to_function(key + ".values." 
+ literal_field(text_type(percent))) elif s.aggregate == "cardinality": canonical_names = [] - for es_col in es_cols: - cn = literal_field(es_col.es_column + "_cardinality") + for column in columns: + cn = literal_field(column.es_column + "_cardinality") canonical_names.append(cn) - es_query.aggs[cn].cardinality.field = es_col.es_column - if len(es_cols) == 1: + es_query.aggs[cn].cardinality.field = column.es_column + if len(columns) == 1: s.pull = jx_expression_to_function(canonical_names[0] + ".value") else: s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0}) elif s.aggregate == "stats": - if len(es_cols) > 1: + if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # REGULAR STATS stats_name = literal_field(canonical_name) - es_query.aggs[stats_name].extended_stats.field = es_cols[0].es_column + es_query.aggs[stats_name].extended_stats.field = columns[0].es_column # GET MEDIAN TOO! median_name = literal_field(canonical_name + "_percentile") - es_query.aggs[median_name].percentiles.field = es_cols[0].es_column + es_query.aggs[median_name].percentiles.field = columns[0].es_column es_query.aggs[median_name].percentiles.percents += [50] s.pull = get_pull_stats(stats_name, median_name) elif s.aggregate == "union": pulls = [] - for es_col in es_cols: - stats_name = encode_property(es_col.es_column) + for column in columns: + stats_name = encode_property(column.es_column) - if es_col.nested_path[0] == ".": + if column.nested_path[0] == ".": es_query.aggs[stats_name] = {"terms": { - "field": es_col.es_column, + "field": column.es_column, "size": Math.min(s.limit, MAX_LIMIT) }} pulls.append(get_bucket_keys(stats_name)) else: es_query.aggs[stats_name] = { - "nested": {"path": es_col.nested_path[0]}, + "nested": {"path": column.nested_path[0]}, "aggs": {"_nested": {"terms": { - "field": es_col.es_column, + "field": column.es_column, "size": Math.min(s.limit, MAX_LIMIT) }}} } @@ -228,11 +230,11 @@ def es_aggsop(es, frum, query): for p in pulls ) else: - if len(es_cols) > 1: + if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # PULL VALUE OUT OF THE stats AGGREGATE - es_query.aggs[literal_field(canonical_name)].extended_stats.field = es_cols[0].es_column + es_query.aggs[literal_field(canonical_name)].extended_stats.field = columns[0].es_column s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." 
+ aggregates[s.aggregate], s.default]}) for i, s in enumerate(formula): @@ -245,13 +247,13 @@ def es_aggsop(es, frum, query): else: Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate) elif s.aggregate == "count": - es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_ruby(schema).script(schema) + es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_es_script(schema).script(schema) s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value") elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") - es_query.aggs[key].percentiles.script = s.value.to_ruby(schema).script(schema) + es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema) es_query.aggs[key].percentiles.percents += [50] s.pull = jx_expression_to_function(key + ".values.50\\.0") elif s.aggregate == "percentile": @@ -259,35 +261,35 @@ def es_aggsop(es, frum, query): key = literal_field(canonical_name + " percentile") percent = Math.round(s.percentile * 100, decimal=6) - es_query.aggs[key].percentiles.script = s.value.to_ruby(schema).script(schema) + es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema) es_query.aggs[key].percentiles.percents += [percent] s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent))) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = canonical_name + " cardinality" - es_query.aggs[key].cardinality.script = s.value.to_ruby(schema).script(schema) + es_query.aggs[key].cardinality.script = s.value.to_es_script(schema).script(schema) s.pull = jx_expression_to_function(key + ".value") elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) - es_query.aggs[stats_name].extended_stats.script = s.value.to_ruby(schema).script(schema) + es_query.aggs[stats_name].extended_stats.script = s.value.to_es_script(schema).script(schema) # GET MEDIAN TOO! median_name = literal_field(canonical_name + " percentile") - es_query.aggs[median_name].percentiles.script = s.value.to_ruby(schema).script(schema) + es_query.aggs[median_name].percentiles.script = s.value.to_es_script(schema).script(schema) es_query.aggs[median_name].percentiles.percents += [50] s.pull = get_pull_stats(stats_name, median_name) elif s.aggregate=="union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) - es_query.aggs[stats_name].terms.script_field = s.value.to_ruby(schema).script(schema) + es_query.aggs[stats_name].terms.script_field = s.value.to_es_script(schema).script(schema) s.pull = jx_expression_to_function(stats_name + ".buckets.key") else: # PULL VALUE OUT OF THE stats AGGREGATE s.pull = jx_expression_to_function(canonical_name + "." 
+ aggregates[s.aggregate]) - es_query.aggs[canonical_name].extended_stats.script = s.value.to_ruby(schema).script(schema) + es_query.aggs[canonical_name].extended_stats.script = s.value.to_es_script(schema).script(schema) decoders = get_decoders_by_depth(query) start = 0 @@ -312,11 +314,7 @@ def es_aggsop(es, frum, query): es_query = wrap({ "aggs": {"_nested": set_default( - { - "nested": { - "path": schema.query_path - } - }, + {"nested": {"path": schema.query_path[0]}}, es_query )} }) @@ -442,6 +440,8 @@ def aggs_iterator(aggs, decoders, coord=True): if coord: for a, parts in _aggs_iterator(unwrap(aggs), depth - 1): coord = tuple(d.get_index(parts) for d in decoders) + if any(c is None for c in coord): + continue yield parts, coord, a else: for a, parts in _aggs_iterator(unwrap(aggs), depth - 1): diff --git a/vendor/jx_elasticsearch/es14/decoders.py b/vendor/jx_elasticsearch/es14/decoders.py index affde1c..99ed8d0 100644 --- a/vendor/jx_elasticsearch/es14/decoders.py +++ b/vendor/jx_elasticsearch/es14/decoders.py @@ -13,20 +13,20 @@ from __future__ import unicode_literals from collections import Mapping -from mo_future import text_type, binary_type - +from jx_base import STRING, NUMBER, BOOLEAN from jx_base.dimensions import Dimension from jx_base.domains import SimpleSetDomain, DefaultDomain, PARTITION -from jx_base.expressions import TupleOp +from jx_base.expressions import TupleOp, TRUE from jx_base.query import MAX_LIMIT, DEFAULT_LIMIT -from jx_elasticsearch.es14.expressions import Variable, NotOp, InOp, Literal, OrOp, AndOp, InequalityOp, LeavesOp +from jx_elasticsearch.es14.expressions import Variable, NotOp, InOp, Literal, AndOp, InequalityOp, LeavesOp, LIST_TO_PIPE from jx_python import jx -from mo_dots import set_default, coalesce, literal_field, Data, relative_field -from mo_dots import wrap +from mo_dots import wrap, set_default, coalesce, literal_field, Data, relative_field, unwraplist +from mo_future import text_type from mo_json.typed_encoder import untype_path from mo_logs import Log -from mo_math import MAX, MIN -from mo_math import Math +from mo_logs.strings import quote, expand_template +from mo_math import MAX, MIN, Math +from pyLibrary.convert import string2boolean class AggsDecoder(object): @@ -37,7 +37,7 @@ class AggsDecoder(object): # if query.groupby: # return object.__new__(DefaultDecoder, e) - if isinstance(e.value, (text_type, binary_type)): + if isinstance(e.value, text_type): Log.error("Expecting Variable or Expression, not plain string") if isinstance(e.value, LeavesOp): @@ -63,6 +63,9 @@ class AggsDecoder(object): limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT) if col.partitions != None: + if col.multi > 1 and len(col.partitions) < 6: + return object.__new__(MultivalueDecoder) + partitions = col.partitions[:limit:] if e.domain.sort==-1: partitions = list(reversed(sorted(partitions))) @@ -138,18 +141,18 @@ class SetDecoder(AggsDecoder): def __init__(self, edge, query, limit): AggsDecoder.__init__(self, edge, query, limit) domain = self.domain = edge.domain + self.sorted = None + self.pull = pull_functions[STRING] # WE ASSUME IF THE VARIABLES MATCH, THEN THE SORT TERM AND EDGE TERM MATCH, AND WE SORT BY TERM # self.sorted = {1: "asc", -1: "desc", None: None}[getattr(edge.domain, 'sort', None)] - edge_var = edge.value.vars() + edge_var = set(v.var for v in edge.value.vars()) if query.sort: for s in query.sort: - if not edge_var - s.value.vars(): + if not edge_var - set(v.var for v in s.value.vars()): self.sorted = {1: "asc", -1: "desc"}[s.sort] 
parts = jx.sort(domain.partitions, {"value": domain.key, "sort": s.sort}) edge.domain = self.domain = SimpleSetDomain(key=domain.key, label=domain.label, partitions=parts) - else: - self.sorted = None def append_query(self, es_query, start): self.start = start @@ -180,7 +183,7 @@ class SetDecoder(AggsDecoder): }}, es_query) else: terms = set_default({"terms": { - "script": value.to_ruby(self.schema).script(self.schema), + "script": value.to_es_script(self.schema).script(self.schema), "size": limit }}, es_query) @@ -206,7 +209,7 @@ class SetDecoder(AggsDecoder): return self.domain.getKeyByIndex(index) def get_value_from_row(self, row): - return row[self.start].get('key') + return self.pull(row[self.start].get('key')) def get_index(self, row): try: @@ -242,7 +245,7 @@ def _range_composer(edge, domain, es_query, to_float, schema): if isinstance(edge.value, Variable): calc = {"field": schema.leaves(edge.value.var)[0].es_column} else: - calc = {"script": edge.value.to_ruby(schema).script(schema)} + calc = {"script": edge.value.to_es_script(schema).script(schema)} return wrap({"aggs": { "_match": set_default( @@ -446,6 +449,44 @@ class RangeDecoder(AggsDecoder): return 1 +class MultivalueDecoder(SetDecoder): + def __init__(self, edge, query, limit): + AggsDecoder.__init__(self, edge, query, limit) + self.var = edge.value.var + self.values = query.frum.schema[edge.value.var][0].partitions + self.parts = [] + + def append_query(self, es_query, start): + self.start = start + + es_field = self.query.frum.schema.leaves(self.var)[0].es_column + es_query = wrap({"aggs": { + "_match": set_default({"terms": { + "script": expand_template(LIST_TO_PIPE, {"expr": 'doc[' + quote(es_field) + '].values'}) + }}, es_query) + }}) + + return es_query + + def get_value_from_row(self, row): + values = row[self.start]['key'].replace("||", "\b").split("|") + if len(values) == 2: + return None + return unwraplist([v.replace("\b", "|") for v in values[1:-1]]) + + def get_index(self, row): + find = self.get_value_from_row(row) + try: + return self.parts.index(find) + except Exception: + self.parts.append(find) + return len(self.parts)-1 + + @property + def num_columns(self): + return 1 + + class ObjectDecoder(AggsDecoder): def __init__(self, edge, query, limit): AggsDecoder.__init__(self, edge, query, limit) @@ -535,70 +576,67 @@ class DefaultDecoder(SetDecoder): self.parts = list() self.key2index = {} self.computed_domain = False + self.script = self.edge.value.partial_eval().to_es_script(self.schema) + self.pull = pull_functions[self.script.data_type] + self.missing = self.script.miss.partial_eval() + self.exists = NotOp("not", self.missing).partial_eval() - # WE ASSUME IF THE VARIABLES MATCH, THEN THE SORT TERM AND EDGE TERM MATCH, AND WE SORT BY TERM - self.sorted = None - edge_var = edge.value.vars() - for s in query.sort: - if not edge_var - s.value.vars(): - self.sorted = {1: "asc", -1: "desc"}[s.sort] + # WHEN SORT VALUE AND EDGE VALUE MATCHES, WE SORT BY TERM + sort_candidates = [s for s in self.query.sort if s.value == self.edge.value] + if sort_candidates: + self.es_order = {"_term": {1: "asc", -1: "desc"}[sort_candidates[0].sort]} + else: + self.es_order = None def append_query(self, es_query, start): self.start = start - value = self.edge.value.partial_eval() - script = value.to_ruby(self.schema) - exists = NotOp("not", script.miss).partial_eval() if not isinstance(self.edge.value, Variable): - - output = wrap({"aggs": { - "_match": { - "filter": exists.to_esfilter(self.schema), - "aggs": { - "_filter": 
set_default( - {"terms": { - "script": script.expr, - "size": self.domain.limit, - "order": {"_term": self.sorted} if self.sorted else None - }}, - es_query - ) - } - }, - "_missing": set_default( - {"filter": NotOp("not", exists).to_esfilter(self.schema)}, - es_query - ) - }}) - return output - elif self.edge.value.var in [s.value.var for s in self.query.sort]: - sort_dir = [s.sort for s in self.query.sort if s.value.var == self.edge.value.var][0] - output = wrap({"aggs": { - "_match": set_default( - {"terms": { - "field": self.schema.leaves(self.edge.value.var)[0].es_column, - "size": self.domain.limit, - "order": {"_term": "asc" if sort_dir == 1 else "desc"} - }}, - es_query - ), - "_missing": set_default( - {"filter": NotOp("not", exists).to_esfilter(self.schema)}, - es_query - ) - }}) + if self.exists is TRUE: + # IF True THEN WE DO NOT NEED THE _filter OR THE _missing (THIS RARELY HAPPENS THOUGH) + output = wrap({"aggs": { + "_match": set_default( + {"terms": { + "script": self.script.expr, + "size": self.domain.limit, + "order": self.es_order + }}, + es_query + ) + }}) + else: + output = wrap({"aggs": { + "_match": { # _match AND _filter REVERSED SO _match LINES UP WITH _missing + "filter": self.exists.to_esfilter(self.schema), + "aggs": { + "_filter": set_default( + {"terms": { + "script": self.script.expr, + "size": self.domain.limit, + "order": self.es_order + }}, + es_query + ) + } + }, + "_missing": set_default( + {"filter": self.missing.to_esfilter(self.schema)}, + es_query + ) + }}) return output else: output = wrap({"aggs": { "_match": set_default( {"terms": { "field": self.schema.leaves(self.edge.value.var)[0].es_column, - "size": self.domain.limit + "size": self.domain.limit, + "order": self.es_order }}, es_query ), "_missing": set_default( - {"filter": NotOp("not", exists).to_esfilter(self.schema)}, + {"filter": self.missing.to_esfilter(self.schema)}, es_query ) }}) @@ -608,7 +646,7 @@ class DefaultDecoder(SetDecoder): part = row[self.start] if part['doc_count']: if part.get('key') != None: - self.parts.append(part.get('key')) + self.parts.append(self.pull(part.get('key'))) else: self.edge.allowNulls = True # OK! 
WE WILL ALLOW NULLS @@ -623,19 +661,19 @@ class DefaultDecoder(SetDecoder): if self.computed_domain: try: part = row[self.start] - return self.domain.getIndexByKey(part.get('key')) + return self.domain.getIndexByKey(self.pull(part.get('key'))) except Exception as e: Log.error("problem", cause=e) else: try: part = row[self.start] - key = part.get('key') + key = self.pull(part.get('key')) i = self.key2index.get(key) if i is None: i = len(self.parts) part = {"key": key, "dataIndex": i} - self.parts.append({"key": key, "dataIndex": i}) - self.key2index[i] = part + self.parts.append(part) + self.key2index[key] = i return i except Exception as e: Log.error("problem", cause=e) @@ -648,6 +686,7 @@ class DefaultDecoder(SetDecoder): class DimFieldListDecoder(SetDecoder): def __init__(self, edge, query, limit): AggsDecoder.__init__(self, edge, query, limit) + edge.allowNulls = False self.fields = edge.domain.dimension.fields self.domain = self.edge.domain self.domain.limit = Math.min(coalesce(self.domain.limit, query.limit, 10), MAX_LIMIT) @@ -665,11 +704,10 @@ class DimFieldListDecoder(SetDecoder): "size": self.domain.limit }}, es_query)} }}}) - if self.edge.allowNulls: - nest.aggs._missing = set_default( - {"filter": NotOp("not", exists).to_esfilter(self.schema)}, - es_query - ) + nest.aggs._missing = set_default( + {"filter": NotOp("not", exists).to_esfilter(self.schema)}, + es_query + ) es_query = nest if self.domain.where: @@ -696,11 +734,20 @@ class DimFieldListDecoder(SetDecoder): ) def get_index(self, row): - find = tuple(p.get("key") for p in row[self.start:self.start + self.num_columns:]) - return self.domain.getIndexByKey(find) - + part = row[self.start:self.start + len(self.fields):] + if part[0]['doc_count']==0: + return None + find = tuple(p.get("key") for p in part) + output = self.domain.getIndexByKey(find) + return output @property def num_columns(self): return len(self.fields) +pull_functions = { + STRING: lambda x: x, + NUMBER: lambda x: float(x) if x !=None else None, + BOOLEAN: string2boolean +} + diff --git a/vendor/jx_elasticsearch/es14/deep.py b/vendor/jx_elasticsearch/es14/deep.py index a6eab23..484ba27 100644 --- a/vendor/jx_elasticsearch/es14/deep.py +++ b/vendor/jx_elasticsearch/es14/deep.py @@ -11,7 +11,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals -from jx_base import STRUCT, NESTED, EXISTS +from jx_base import NESTED from jx_base.expressions import NULL from jx_base.query import DEFAULT_LIMIT from jx_elasticsearch import post as es_post @@ -49,8 +49,7 @@ def is_deepop(es, query): def es_deepop(es, query): schema = query.frum.schema - columns = schema.columns - query_path = schema.query_path + query_path = schema.query_path[0] # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER: ES ALLOWS @@ -97,7 +96,7 @@ def es_deepop(es, query): col_names = set() for c in leaves: if c.nested_path[0] == ".": - if c.type == NESTED: + if c.jx_type == NESTED: continue es_query.fields += [c.es_column] c_name = untype_path(c.names[query_path]) @@ -128,7 +127,7 @@ def es_deepop(es, query): for n in net_columns: pull = get_pull_function(n) if n.nested_path[0] == ".": - if n.type == NESTED: + if n.jx_type == NESTED: continue es_query.fields += [n.es_column] @@ -155,14 +154,14 @@ def es_deepop(es, query): else: expr = s.value for v in expr.vars(): - for c in schema[v]: + for c in schema[v.var]: if c.nested_path[0] == ".": es_query.fields += [c.es_column] # 
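Terms aggregations hand their keys back as strings, so the decoders look up a converter in pull_functions (defined at the end of this module) keyed by the script's JSON type before storing keys in the domain. A sketch of the same idea, with string2boolean spelled out as an assumption about the imported helper:

```python
def string2boolean(value):
    # assumed behaviour of the imported helper: this typed encoding stores
    # booleans as the strings "T" and "F"
    if value is None:
        return None
    return value == "T"


pull_functions = {
    "string": lambda x: x,
    "number": lambda x: float(x) if x is not None else None,
    "boolean": string2boolean,
}

# pull_functions["number"]("42") -> 42.0
```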
else: # Log.error("deep field not expected") pull_name = EXPRESSION_PREFIX + s.name - map_to_local = {untype_path(k): get_pull(cc) for k, c in schema.lookup.items() for cc in c if cc.type not in STRUCT} + map_to_local = MapToLocal(schema) pull = jx_expression_to_function(pull_name) post_expressions[pull_name] = compile_expression(expr.map(map_to_local).to_python()) @@ -217,3 +216,23 @@ def es_deepop(es, query): Log.error("problem formatting", e) +class MapToLocal(object): + """ + MAP FROM RELATIVE/ABSOLUTE NAMESPACE TO PYTHON THAT WILL EXTRACT RESULT + """ + def __init__(self, map_to_columns): + self.map_to_columns = map_to_columns + + def __getitem__(self, item): + return self.get(item) + + def get(self, item): + cs = self.map_to_columns[item] + if len(cs) == 0: + return "Null" + elif len(cs) == 1: + return get_pull(cs[0]) + else: + return "coalesce(" + (",".join(get_pull(c) for c in cs)) + ")" + + diff --git a/vendor/jx_elasticsearch/es14/expressions.py b/vendor/jx_elasticsearch/es14/expressions.py index d8cadcb..e3fbd18 100644 --- a/vendor/jx_elasticsearch/es14/expressions.py +++ b/vendor/jx_elasticsearch/es14/expressions.py @@ -13,22 +13,20 @@ from __future__ import unicode_literals import itertools -from mo_future import text_type - from jx_base import NUMBER, STRING, BOOLEAN, OBJECT, INTEGER from jx_base.expressions import Variable, TupleOp, LeavesOp, BinaryOp, OrOp, ScriptOp, \ WhenOp, InequalityOp, extend, Literal, NullOp, TrueOp, FalseOp, DivOp, FloorOp, \ EqOp, NeOp, NotOp, LengthOp, NumberOp, StringOp, CountOp, MultiOp, RegExpOp, CoalesceOp, MissingOp, ExistsOp, \ PrefixOp, NotLeftOp, InOp, CaseOp, AndOp, \ - ConcatOp, IsNumberOp, Expression, BasicIndexOfOp, MaxOp, MinOp, BasicEqOp, BooleanOp, IntegerOp, BasicSubstringOp, ZERO, NULL, FirstOp, FALSE, TRUE, simplified -from mo_dots import coalesce, wrap, Null, unwraplist, set_default, literal_field - + ConcatOp, IsNumberOp, Expression, BasicIndexOfOp, MaxOp, MinOp, BasicEqOp, BooleanOp, IntegerOp, BasicSubstringOp, ZERO, NULL, FirstOp, FALSE, TRUE, SuffixOp, simplified, ONE, LeftOp +from jx_elasticsearch.es14.util import es_not, es_script, es_or, es_and, es_missing +from mo_dots import coalesce, wrap, Null, set_default, literal_field +from mo_future import text_type from mo_logs import Log, suppress_exception from mo_logs.strings import expand_template, quote from mo_math import MAX, OR from pyLibrary.convert import string2regexp - TO_STRING = """ ({it -> value = {{expr}}; @@ -39,11 +37,25 @@ TO_STRING = """ })() """ -# ((Runnable)(() -> {int a=2; int b=3; System.out.println(a+b);})).run(); -# "((Runnable)((value) -> {String output=String.valueOf(value); if (output.endsWith('.0')) {return output.substring(0, output.length-2);} else return output;})).run(" + value.expr + ")" + +LIST_TO_PIPE = """ +StringBuffer output=new StringBuffer(); +for(String s : {{expr}}){ + output.append("|"); + String sep2=""; + StringTokenizer parts = new StringTokenizer(s, "|"); + while (parts.hasMoreTokens()){ + output.append(sep2); + output.append(parts.nextToken()); + sep2="||"; + }//for +}//for +output.append("|"); +return output.toString() +""" -class Ruby(Expression): +class EsScript(Expression): __slots__ = ("miss", "data_type", "expr", "many") def __init__(self, type, expr, frum, miss=None, many=False): @@ -65,16 +77,16 @@ class Ruby(Expression): """ missing = self.miss.partial_eval() if missing is FALSE: - return self.partial_eval().to_ruby(schema).expr + return self.partial_eval().to_es_script(schema).expr elif missing is TRUE: return "null" - 
return "(" + missing.to_ruby(schema).expr + ")?null:(" + self.expr + ")" + return "(" + missing.to_es_script(schema).expr + ")?null:(" + self.expr + ")" def to_esfilter(self, schema): - return {"script": {"script": self.script(schema)}} + return {"script": es_script(self.script(schema))} - def to_ruby(self, schema): + def to_es_script(self, schema): return self def missing(self): @@ -84,7 +96,7 @@ class Ruby(Expression): return {"script": self.script} def __eq__(self, other): - if not isinstance(other, Ruby): + if not isinstance(other, EsScript): return False elif self.expr==other.expr: return True @@ -93,9 +105,9 @@ class Ruby(Expression): @extend(BinaryOp) -def to_ruby(self, schema): - lhs = NumberOp("number", self.lhs).partial_eval().to_ruby(schema).expr - rhs = NumberOp("number", self.rhs).partial_eval().to_ruby(schema).expr +def to_es_script(self, schema): + lhs = NumberOp("number", self.lhs).partial_eval().to_es_script(schema).expr + rhs = NumberOp("number", self.rhs).partial_eval().to_es_script(schema).expr script = "(" + lhs + ") " + BinaryOp.operators[self.op] + " (" + rhs + ")" missing = OrOp("or", [self.lhs.missing(), self.rhs.missing()]) @@ -105,20 +117,20 @@ def to_ruby(self, schema): **{ "then": self.default, "else": - Ruby(type=NUMBER, expr=script, frum=self) + EsScript(type=NUMBER, expr=script, frum=self) } - ).partial_eval().to_ruby(schema) + ).partial_eval().to_es_script(schema) @extend(BinaryOp) def to_esfilter(self, schema): if not isinstance(self.lhs, Variable) or not isinstance(self.rhs, Literal) or self.op in BinaryOp.operators: - return self.to_ruby(schema).to_esfilter(schema) + return self.to_es_script(schema).to_esfilter(schema) if self.op in ["eq", "term"]: return {"term": {self.lhs.var: self.rhs.to_esfilter(schema)}} elif self.op in ["ne", "neq"]: - return {"not": {"term": {self.lhs.var: self.rhs.to_esfilter(schema)}}} + return es_not({"term": {self.lhs.var: self.rhs.to_esfilter(schema)}}) elif self.op in BinaryOp.ineq_ops: return {"range": {self.lhs.var: {self.op: self.rhs.value}}} else: @@ -126,14 +138,14 @@ def to_esfilter(self, schema): @extend(CaseOp) -def to_ruby(self, schema): - acc = self.whens[-1].partial_eval().to_ruby(schema) +def to_es_script(self, schema): + acc = self.whens[-1].partial_eval().to_es_script(schema) for w in reversed(self.whens[0:-1]): acc = WhenOp( "when", w.when, **{"then": w.then, "else": acc} - ).partial_eval().to_ruby(schema) + ).partial_eval().to_es_script(schema) return acc @@ -150,7 +162,7 @@ def to_esfilter(self, schema): ).partial_eval().to_esfilter(schema) else: Log.error("do not know how to handle") - return ScriptOp("script", self.to_ruby(schema).script(schema)).to_esfilter(schema) + return ScriptOp("script", self.to_es_script(schema).script(schema)).to_esfilter(schema) @extend(ConcatOp) @@ -158,89 +170,89 @@ def to_esfilter(self, schema): if isinstance(self.value, Variable) and isinstance(self.find, Literal): return {"regexp": {self.value.var: ".*" + string2regexp(self.find.value) + ".*"}} else: - return ScriptOp("script", self.to_ruby(schema).script(schema)).to_esfilter(schema) + return ScriptOp("script", self.to_es_script(schema).script(schema)).to_esfilter(schema) @extend(ConcatOp) -def to_ruby(self, schema): +def to_es_script(self, schema): if len(self.terms) == 0: - return self.default.to_ruby(schema) + return self.default.to_es_script(schema) acc = [] separator = StringOp("string", self.separator).partial_eval() - sep = separator.to_ruby(schema).expr + sep = separator.to_es_script(schema).expr for t in self.terms: 
val = WhenOp( "when", t.missing(), **{ "then": Literal("literal", ""), - "else": Ruby(type=STRING, expr=sep + "+" + StringOp(None, t).partial_eval().to_ruby(schema).expr, frum=t) + "else": EsScript(type=STRING, expr=sep + "+" + StringOp(None, t).partial_eval().to_es_script(schema).expr, frum=t) # "else": ConcatOp("concat", [sep, t]) } ) - acc.append("(" + val.partial_eval().to_ruby(schema).expr + ")") - expr_ = "(" + "+".join(acc) + ").substring(" + LengthOp("length", separator).to_ruby(schema).expr + ")" + acc.append("(" + val.partial_eval().to_es_script(schema).expr + ")") + expr_ = "(" + "+".join(acc) + ").substring(" + LengthOp("length", separator).to_es_script(schema).expr + ")" if isinstance(self.default, NullOp): - return Ruby( + return EsScript( miss=self.missing(), type=STRING, expr=expr_, frum=self ) else: - return Ruby( + return EsScript( miss=self.missing(), type=STRING, - expr="((" + expr_ + ").length==0) ? (" + self.default.to_ruby(schema).expr + ") : (" + expr_ + ")", + expr="((" + expr_ + ").length==0) ? (" + self.default.to_es_script(schema).expr + ") : (" + expr_ + ")", frum=self ) @extend(Literal) -def to_ruby(self, schema): +def to_es_script(self, schema): def _convert(v): if v is None: - return NULL.to_ruby(schema) + return NULL.to_es_script(schema) if v is True: - return Ruby( + return EsScript( type=BOOLEAN, expr="true", frum=self ) if v is False: - return Ruby( + return EsScript( type=BOOLEAN, expr="false", frum=self ) if isinstance(v, text_type): - return Ruby( + return EsScript( type=STRING, expr=quote(v), frum=self ) if isinstance(v, int): - return Ruby( + return EsScript( type=INTEGER, expr=text_type(v), frum=self ) if isinstance(v, float): - return Ruby( + return EsScript( type=NUMBER, expr=text_type(v), frum=self ) if isinstance(v, dict): - return Ruby( + return EsScript( type=OBJECT, expr="[" + ", ".join(quote(k) + ": " + _convert(vv) for k, vv in v.items()) + "]", frum=self ) if isinstance(v, (list, tuple)): - return Ruby( + return EsScript( type=OBJECT, expr="[" + ", ".join(_convert(vv).expr for vv in v) + "]", frum=self @@ -250,16 +262,16 @@ def to_ruby(self, schema): @extend(CoalesceOp) -def to_ruby(self, schema): +def to_es_script(self, schema): if not self.terms: - return NULL.to_ruby(schema) + return NULL.to_es_script(schema) v = self.terms[-1] - acc = FirstOp("first", v).partial_eval().to_ruby(schema) + acc = FirstOp("first", v).partial_eval().to_es_script(schema) for v in reversed(self.terms[:-1]): m = v.missing().partial_eval() - e = NotOp("not", m).partial_eval().to_ruby(schema) - r = FirstOp("first", v).partial_eval().to_ruby(schema) + e = NotOp("not", m).partial_eval().to_es_script(schema) + r = FirstOp("first", v).partial_eval().to_es_script(schema) if r.miss is TRUE: continue @@ -275,7 +287,7 @@ def to_ruby(self, schema): else: new_type = OBJECT - acc = Ruby( + acc = EsScript( miss=AndOp("and", [acc.miss, m]).partial_eval(), type=new_type, expr="(" + e.expr + ") ? 
(" + r.expr + ") : (" + acc.expr + ")", @@ -290,8 +302,8 @@ def to_esfilter(self, schema): @extend(ExistsOp) -def to_ruby(self, schema): - return self.field.exists().partial_eval().to_ruby(schema) +def to_es_script(self, schema): + return self.field.exists().partial_eval().to_es_script(schema) @extend(ExistsOp) @@ -305,8 +317,8 @@ def to_esfilter(self, schema): @extend(NullOp) -def to_ruby(self, schema): - return Ruby( +def to_es_script(self, schema): + return EsScript( miss=TRUE, type=OBJECT, expr="null", @@ -315,17 +327,17 @@ def to_ruby(self, schema): @extend(NullOp) def to_esfilter(self, schema): - return {"not": {"match_all": {}}} + return es_not({"match_all": {}}) @extend(FalseOp) -def to_ruby(self, schema): - return Ruby(type=BOOLEAN, expr="false", frum=self) +def to_es_script(self, schema): + return EsScript(type=BOOLEAN, expr="false", frum=self) @extend(FalseOp) def to_esfilter(self, schema): - return {"not": {"match_all": {}}} + return MATCH_NONE @extend(TupleOp) @@ -333,8 +345,21 @@ def to_esfilter(self, schema): Log.error("not supported") +@extend(TupleOp) +def to_es_script(self, schema): + terms = [FirstOp("first", t).partial_eval().to_es_script(schema) for t in self.terms] + expr = 'new Object[]{'+','.join(t.expr for t in terms)+'}' + return EsScript( + type=OBJECT, + expr=expr, + miss=FALSE, + many=FALSE, + frum=self + ) + + @extend(LeavesOp) -def to_ruby(self, schema): +def to_es_script(self, schema): Log.error("not supported") @@ -344,9 +369,9 @@ def to_esfilter(self, schema): @extend(InequalityOp) -def to_ruby(self, schema): - lhs = NumberOp("number", self.lhs).partial_eval().to_ruby(schema).expr - rhs = NumberOp("number", self.rhs).partial_eval().to_ruby(schema).expr +def to_es_script(self, schema): + lhs = NumberOp("number", self.lhs).partial_eval().to_es_script(schema).expr + rhs = NumberOp("number", self.rhs).partial_eval().to_es_script(schema).expr script = "(" + lhs + ") " + InequalityOp.operators[self.op] + " (" + rhs + ")" output = WhenOp( @@ -355,9 +380,9 @@ def to_ruby(self, schema): **{ "then": FALSE, "else": - Ruby(type=BOOLEAN, expr=script, frum=self) + EsScript(type=BOOLEAN, expr=script, frum=self) } - ).partial_eval().to_ruby(schema) + ).partial_eval().to_es_script(schema) return output @@ -373,23 +398,26 @@ def to_esfilter(self, schema): Log.error("operator {{op|quote}} does not work on objects", op=self.op) return {"range": {lhs: {self.op: self.rhs.value}}} else: - return {"script": {"script": self.to_ruby(schema).script(schema)}} + script = self.to_es_script(schema) + if script.miss is not FALSE: + Log.error("inequality must be decisive") + return {"script": es_script(script.expr)} @extend(DivOp) -def to_ruby(self, schema): +def to_es_script(self, schema): lhs = NumberOp("number", self.lhs).partial_eval() rhs = NumberOp("number", self.rhs).partial_eval() - script = "(" + lhs.to_ruby(schema).expr + ") / (" + rhs.to_ruby(schema).expr + ")" + script = "(" + lhs.to_es_script(schema).expr + ") / (" + rhs.to_es_script(schema).expr + ")" output = WhenOp( "when", OrOp("or", [self.lhs.missing(), self.rhs.missing(), EqOp("eq", [self.rhs, ZERO])]), **{ "then": self.default, - "else": Ruby(type=NUMBER, expr=script, frum=self) + "else": EsScript(type=NUMBER, expr=script, frum=self) } - ).partial_eval().to_ruby(schema) + ).partial_eval().to_es_script(schema) return output @@ -400,20 +428,29 @@ def to_esfilter(self, schema): @extend(FloorOp) -def to_ruby(self, schema): - lhs = self.lhs.to_ruby(schema) - rhs = self.rhs.to_ruby(schema) - script = 
"(int)Math.floor(((double)(" + lhs + ") / (double)(" + rhs + ")).doubleValue())*(" + rhs + ")" +def to_es_script(self, schema): + lhs = self.lhs.partial_eval().to_es_script(schema) + rhs = self.rhs.partial_eval().to_es_script(schema) + + if rhs.frum is ONE: + script = "(int)Math.floor(" + lhs.expr + ")" + else: + script = "Math.floor((" + lhs.expr + ") / (" + rhs.expr + "))*(" + rhs.expr + ")" output = WhenOp( "when", - OrOp("or", [self.lhs.missing(), self.rhs.missing(), EqOp("eq", [self.rhs, ZERO])]), + OrOp("or", [lhs.miss, rhs.miss, EqOp("eq", [self.rhs, ZERO])]), **{ "then": self.default, "else": - ScriptOp("script", script) + EsScript( + type=NUMBER, + expr=script, + frum=self, + miss=FALSE + ) } - ).to_ruby(schema) + ).to_es_script(schema) return output @@ -422,13 +459,21 @@ def to_esfilter(self, schema): Log.error("Logic error") +@simplified @extend(EqOp) -def to_ruby(self, schema): +def partial_eval(self): + lhs = self.lhs.partial_eval() + rhs = self.rhs.partial_eval() + return EqOp("eq", [lhs, rhs]) + + +@extend(EqOp) +def to_es_script(self, schema): return CaseOp("case", [ WhenOp("when", self.lhs.missing(), **{"then": self.rhs.missing()}), WhenOp("when", self.rhs.missing(), **{"then": FALSE}), BasicEqOp("eq", [self.lhs, self.rhs]) - ]).partial_eval().to_ruby(schema) + ]).partial_eval().to_es_script(schema) @extend(EqOp) @@ -457,26 +502,26 @@ def to_esfilter(self, schema): @extend(BasicEqOp) -def to_ruby(self, schema): - lhs = self.lhs.partial_eval().to_ruby(schema) - rhs = self.rhs.partial_eval().to_ruby(schema) +def to_es_script(self, schema): + lhs = self.lhs.partial_eval().to_es_script(schema) + rhs = self.rhs.partial_eval().to_es_script(schema) if lhs.many: if rhs.many: return AndOp("and", [ - Ruby(type=BOOLEAN, expr="(" + lhs.expr + ").size()==(" + rhs.expr + ").size()", frum=self), - Ruby(type=BOOLEAN, expr="(" + rhs.expr + ").containsAll(" + lhs.expr + ")", frum=self) - ]).to_ruby(schema) + EsScript(type=BOOLEAN, expr="(" + lhs.expr + ").size()==(" + rhs.expr + ").size()", frum=self), + EsScript(type=BOOLEAN, expr="(" + rhs.expr + ").containsAll(" + lhs.expr + ")", frum=self) + ]).to_es_script(schema) else: - return Ruby(type=BOOLEAN, expr="(" + lhs.expr + ").contains(" + rhs.expr + ")", frum=self) + return EsScript(type=BOOLEAN, expr="(" + lhs.expr + ").contains(" + rhs.expr + ")", frum=self) elif rhs.many: - return Ruby( + return EsScript( type=BOOLEAN, expr="(" + rhs.expr + ").contains(" + lhs.expr + ")", frum=self ) else: - return Ruby( + return EsScript( type=BOOLEAN, expr="(" + lhs.expr + "==" + rhs.expr + ")", frum=self @@ -499,30 +544,32 @@ def to_esfilter(self, schema): else: return {"term": {lhs: rhs}} else: - return self.to_ruby(schema).to_esfilter(schema) + return self.to_es_script(schema).to_esfilter(schema) @extend(MissingOp) -def to_ruby(self, schema, not_null=False, boolean=True): +def to_es_script(self, schema, not_null=False, boolean=True): if isinstance(self.expr, Variable): if self.expr.var == "_id": - return Ruby(type=BOOLEAN, expr="false", frum=self) + return EsScript(type=BOOLEAN, expr="false", frum=self) else: columns = schema.leaves(self.expr.var) if len(columns) == 1: - return Ruby(type=BOOLEAN, expr="doc[" + quote(columns[0].es_column) + "].isEmpty()", frum=self) + return EsScript(type=BOOLEAN, expr="doc[" + quote(columns[0].es_column) + "].isEmpty()", frum=self) else: return AndOp("and", [ - Ruby( + EsScript( type=BOOLEAN, expr="doc[" + quote(c.es_column) + "].isEmpty()", frum=self ) for c in columns - ]).partial_eval().to_ruby(schema) + 
]).partial_eval().to_es_script(schema) + elif isinstance(self.expr, Literal): + return self.expr.missing().to_es_script(schema) else: - return self.expr.missing().partial_eval().to_ruby(schema) + return self.expr.missing().partial_eval().to_es_script(schema) @extend(MissingOp) @@ -532,22 +579,22 @@ def to_esfilter(self, schema): if not cols: return {"match_all": {}} elif len(cols) == 1: - return {"missing": {"field": cols[0].es_column}} + return es_missing(cols[0].es_column) else: - return {"and": [ - {"missing": {"field": c.es_column}} for c in cols - ]} + return es_and([ + es_missing(c.es_column) for c in cols + ]) else: - return ScriptOp("script", self.to_ruby(schema).script(schema)).to_esfilter(schema) + return ScriptOp("script", self.to_es_script(schema).script(schema)).to_esfilter(schema) @extend(NotLeftOp) -def to_ruby(self, schema): - v = StringOp("string", self.value).partial_eval().to_ruby(schema).expr - l = NumberOp("number", self.length).partial_eval().to_ruby(schema).expr +def to_es_script(self, schema): + v = StringOp("string", self.value).partial_eval().to_es_script(schema).expr + l = NumberOp("number", self.length).partial_eval().to_es_script(schema).expr expr = "(" + v + ").substring((int)Math.max(0, (int)Math.min(" + v + ".length(), " + l + ")))" - return Ruby( + return EsScript( miss=OrOp("or", [self.value.missing(), self.length.missing()]), type=STRING, expr=expr, @@ -556,13 +603,12 @@ def to_ruby(self, schema): @extend(NeOp) -def to_ruby(self, schema): - output = CaseOp("case", [ +def to_es_script(self, schema): + return CaseOp("case", [ WhenOp("when", self.lhs.missing(), **{"then": NotOp("not", self.rhs.missing())}), WhenOp("when", self.rhs.missing(), **{"then": NotOp("not", self.lhs.missing())}), NotOp("not", BasicEqOp("eq", [self.lhs, self.rhs])) - ]).partial_eval().to_ruby(schema) - return output + ]).partial_eval().to_es_script(schema) @extend(NeOp) @@ -572,16 +618,16 @@ def to_esfilter(self, schema): if len(columns) == 0: return {"match_all": {}} elif len(columns) == 1: - return {"not": {"term": {columns[0].es_column: self.rhs.value}}} + return es_not({"term": {columns[0].es_column: self.rhs.value}}) else: Log.error("column split to multiple, not handled") else: - lhs = self.lhs.partial_eval().to_ruby(schema) - rhs = self.rhs.partial_eval().to_ruby(schema) + lhs = self.lhs.partial_eval().to_es_script(schema) + rhs = self.rhs.partial_eval().to_es_script(schema) if lhs.many: if rhs.many: - return wrap({"not": + return es_not( ScriptOp( "script", ( @@ -589,26 +635,26 @@ def to_esfilter(self, schema): "(" + rhs.expr + ").containsAll(" + lhs.expr + ")" ) ).to_esfilter(schema) - }) + ) else: - return wrap({"not": + return es_not( ScriptOp("script", "(" + lhs.expr + ").contains(" + rhs.expr + ")").to_esfilter(schema) - }) + ) else: if rhs.many: - return wrap({"not": + return es_not( ScriptOp("script", "(" + rhs.expr + ").contains(" + lhs.expr + ")").to_esfilter(schema) - }) + ) else: - return wrap( + return es_not( ScriptOp("script", "(" + lhs.expr + ") != (" + rhs.expr + ")").to_esfilter(schema) ) @extend(NotOp) -def to_ruby(self, schema): - return Ruby( +def to_es_script(self, schema): + return EsScript( type=BOOLEAN, - expr="!(" + self.term.to_ruby(schema).expr + ")", + expr="!(" + self.term.to_es_script(schema).expr + ")", frum=self ) @@ -623,18 +669,18 @@ def to_esfilter(self, schema): return {"exists": {"field": v}} else: operand = self.term.to_esfilter(schema) - return {"not": operand} + return es_not(operand) @extend(AndOp) -def to_ruby(self, schema): +def 
to_es_script(self, schema): if not self.terms: - return TRUE.to_ruby() + return TRUE.to_es_script() else: - return Ruby( + return EsScript( miss=FALSE, type=BOOLEAN, - expr=" && ".join("(" + t.to_ruby(schema).expr + ")" for t in self.terms), + expr=" && ".join("(" + t.to_es_script(schema).expr + ")" for t in self.terms), frum=self ) @@ -644,29 +690,38 @@ def to_esfilter(self, schema): if not len(self.terms): return {"match_all": {}} else: - return {"and": [t.to_esfilter(schema) for t in self.terms]} + return es_and([t.to_esfilter(schema) for t in self.terms]) @extend(OrOp) -def to_ruby(self, schema): - return Ruby( +def to_es_script(self, schema): + return EsScript( miss=FALSE, type=BOOLEAN, - expr=" || ".join("(" + t.to_ruby(schema).expr + ")" for t in self.terms if t), + expr=" || ".join("(" + t.to_es_script(schema).expr + ")" for t in self.terms if t), frum=self ) @extend(OrOp) def to_esfilter(self, schema): - return {"or": [t.to_esfilter(schema) for t in self.terms]} + # OR(x) == NOT(AND(NOT(xi) for xi in x)) + output = es_not(es_and([ + NotOp("not", t).partial_eval().to_esfilter(schema) + for t in self.terms + ])) + return output + + # WE REQUIRE EXIT-EARLY SEMANTICS, OTHERWISE EVERY EXPRESSION IS A SCRIPT EXPRESSION + # {"bool":{"should" :[a, b, c]}} RUNS IN PARALLEL + # {"bool":{"must_not":[a, b, c]}} ALSO RUNS IN PARALLEL @extend(LengthOp) -def to_ruby(self, schema): - value = StringOp("string", self.term).to_ruby(schema) +def to_es_script(self, schema): + value = StringOp("string", self.term).to_es_script(schema) missing = self.term.missing().partial_eval() - return Ruby( + return EsScript( miss=missing, type=INTEGER, expr="(" + value.expr + ").length()", @@ -675,69 +730,65 @@ def to_ruby(self, schema): @extend(FirstOp) -def to_ruby(self, schema): - term = self.term.to_ruby(schema) +def to_es_script(self, schema): + if isinstance(self.term, Variable): + columns = schema.values(self.term.var) + if len(columns) == 1: + return self.term.to_es_script(schema, many=False) + + term = self.term.to_es_script(schema) if isinstance(term.frum, CoalesceOp): - return CoalesceOp("coalesce", [t.frum for t in term.frum.terms]).to_ruby(schema) + return CoalesceOp("coalesce", [FirstOp("first", t.partial_eval().to_es_script(schema)) for t in term.frum.terms]).to_es_script(schema) if term.many: - return Ruby( + return EsScript( miss=term.miss, type=term.type, expr="(" + term.expr + ")[0]", frum=term.frum - ).to_ruby(schema) + ).to_es_script(schema) else: return term @extend(BooleanOp) -def to_ruby(self, schema): - value = self.term.to_ruby(schema) - - if isinstance(self.term, Variable): - if value.many: - expr = "!"+value.expr + ".isEmpty() && " + value.expr + "[0]==\"T\"" - else: - expr = value.expr + "==\"T\"" - return Ruby( - miss=FALSE, - type=BOOLEAN, - expr=expr, - frum=self - ) - - if value.type == BOOLEAN: - return AndOp("and", [ - ExistsOp("exists", self.term), - FirstOp("first", self.term) - ]).partial_eval().to_ruby() - +def to_es_script(self, schema): + value = self.term.to_es_script(schema) + if value.many: + return BooleanOp("boolean", EsScript( + miss=value.miss, + type=value.type, + expr="(" + value.expr + ")[0]", + frum=value.frum + )).to_es_script(schema) + elif value.type == BOOLEAN: + miss = value.miss + value.miss = FALSE + return WhenOp("when", miss, **{"then": FALSE, "else": value}).partial_eval().to_es_script(schema) else: - return ExistsOp("exists", self.term).partial_eval().to_ruby() - + return NotOp("not", value.miss).partial_eval().to_es_script(schema) @extend(BooleanOp) 
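The rewrite in OrOp.to_esfilter is plain De Morgan: OR(f1..fn) becomes NOT(AND(NOT(fi))), trading the parallel evaluation of should/or clauses for negated conjunctions that exit early. Over es14-style filter dicts the transform looks like:

```python
def es_or_with_exit_early(filters):
    # OR(x) == NOT(AND(NOT(xi) for xi in x)); the and/not form gets
    # exit-early evaluation where a plain {"or": [...]} runs every branch
    return {"not": {"and": [{"not": f} for f in filters]}}
```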
def to_esfilter(self, schema): if isinstance(self.term, Variable): return {"term": {self.term.var: True}} else: - return self.to_ruby(schema).to_esfilter(schema) + return self.to_es_script(schema).to_esfilter(schema) @extend(IntegerOp) -def to_ruby(self, schema): - value = self.term.to_ruby(schema) +def to_es_script(self, schema): + value = self.term.to_es_script(schema) if value.many: - return IntegerOp("integer", Ruby( + return IntegerOp("integer", EsScript( miss=value.missing, type=value.type, expr="(" + value.expr + ")[0]", frum=value.frum - )).to_ruby(schema) + )).to_es_script(schema) elif value.type == BOOLEAN: - return Ruby( + return EsScript( miss=value.missing, type=INTEGER, expr=value.expr + " ? 1 : 0", @@ -746,21 +797,21 @@ def to_ruby(self, schema): elif value.type == INTEGER: return value elif value.type == NUMBER: - return Ruby( + return EsScript( miss=value.missing, type=INTEGER, expr="(int)(" + value.expr + ")", frum=self ) elif value.type == STRING: - return Ruby( + return EsScript( miss=value.missing, type=INTEGER, expr="Integer.parseInt(" + value.expr + ")", frum=self ) else: - return Ruby( + return EsScript( miss=value.missing, type=INTEGER, expr="((" + value.expr + ") instanceof String) ? Integer.parseInt(" + value.expr + ") : (int)(" + value.expr + ")", @@ -768,43 +819,43 @@ def to_ruby(self, schema): ) @extend(NumberOp) -def to_ruby(self, schema): +def to_es_script(self, schema): term = FirstOp("first", self.term).partial_eval() - value = term.to_ruby(schema) + value = term.to_es_script(schema) if isinstance(value.frum, CoalesceOp): - return CoalesceOp("coalesce", [NumberOp("number", t).partial_eval().to_ruby(schema) for t in value.frum.terms]).to_ruby(schema) + return CoalesceOp("coalesce", [NumberOp("number", t).partial_eval().to_es_script(schema) for t in value.frum.terms]).to_es_script(schema) if value.type == BOOLEAN: - return Ruby( + return EsScript( miss=term.missing().partial_eval(), type=NUMBER, expr=value.expr + " ? 1 : 0", frum=self ) elif value.type == INTEGER: - return Ruby( + return EsScript( miss=term.missing().partial_eval(), type=NUMBER, expr=value.expr, frum=self ) elif value.type == NUMBER: - return Ruby( + return EsScript( miss=term.missing().partial_eval(), type=NUMBER, expr=value.expr, frum=self ) elif value.type == STRING: - return Ruby( + return EsScript( miss=term.missing().partial_eval(), type=NUMBER, expr="Double.parseDouble(" + value.expr + ")", frum=self ) elif value.type == OBJECT: - return Ruby( + return EsScript( miss=term.missing().partial_eval(), type=NUMBER, expr="((" + value.expr + ") instanceof String) ? Double.parseDouble(" + value.expr + ") : (" + value.expr + ")", @@ -813,12 +864,12 @@ def to_ruby(self, schema): @extend(IsNumberOp) -def to_ruby(self, schema): - value = self.term.to_ruby(schema) +def to_es_script(self, schema): + value = self.term.to_es_script(schema) if value.expr or value.i: - return TRUE.to_ruby(schema) + return TRUE.to_es_script(schema) else: - return Ruby( + return EsScript( miss=FALSE, type=BOOLEAN, expr="(" + value.expr + ") instanceof java.lang.Double", @@ -826,11 +877,11 @@ def to_ruby(self, schema): ) @extend(CountOp) -def to_ruby(self, schema): - return Ruby( +def to_es_script(self, schema): + return EsScript( miss=FALSE, type=INTEGER, - expr="+".join("((" + t.missing().partial_eval().to_ruby(schema).expr + ") ? 0 : 1)" for t in self.terms), + expr="+".join("((" + t.missing().partial_eval().to_es_script(schema).expr + ") ? 
0 : 1)" for t in self.terms), frum=self ) @@ -841,11 +892,11 @@ def to_esfilter(self, schema): @extend(MaxOp) -def to_ruby(self, schema): - acc = NumberOp("number", self.terms[-1]).partial_eval().to_ruby(schema).expr +def to_es_script(self, schema): + acc = NumberOp("number", self.terms[-1]).partial_eval().to_es_script(schema).expr for t in reversed(self.terms[0:-1]): - acc = "Math.max(" + NumberOp("number", t).partial_eval().to_ruby(schema).expr + " , " + acc + ")" - return Ruby( + acc = "Math.max(" + NumberOp("number", t).partial_eval().to_es_script(schema).expr + " , " + acc + ")" + return EsScript( miss=AndOp("or", [t.missing() for t in self.terms]), type=NUMBER, expr=acc, @@ -854,11 +905,11 @@ def to_ruby(self, schema): @extend(MinOp) -def to_ruby(self, schema): - acc = NumberOp("number", self.terms[-1]).partial_eval().to_ruby(schema).expr +def to_es_script(self, schema): + acc = NumberOp("number", self.terms[-1]).partial_eval().to_es_script(schema).expr for t in reversed(self.terms[0:-1]): - acc = "Math.min(" + NumberOp("number", t).partial_eval().to_ruby(schema).expr + " , " + acc + ")" - return Ruby( + acc = "Math.min(" + NumberOp("number", t).partial_eval().to_es_script(schema).expr + " , " + acc + ")" + return EsScript( miss=AndOp("or", [t.missing() for t in self.terms]), type=NUMBER, expr=acc, @@ -866,29 +917,38 @@ def to_ruby(self, schema): ) +_painless_operators = { + "add": (" + ", "0"), # (operator, zero-array default value) PAIR + "sum": (" + ", "0"), + "mul": (" * ", "1"), + "mult": (" * ", "1"), + "multiply": (" * ", "1") +} + + @extend(MultiOp) -def to_ruby(self, schema): - op, unit = MultiOp.operators[self.op] +def to_es_script(self, schema): + op, unit = _painless_operators[self.op] if self.nulls: calc = op.join( - "((" + t.missing().to_ruby(schema).expr + ") ? " + unit + " : (" + NumberOp("number", t).partial_eval().to_ruby(schema).expr + "))" for - t in self.terms + "((" + t.missing().to_es_script(schema).expr + ") ? 
" + unit + " : (" + NumberOp("number", t).partial_eval().to_es_script(schema).expr + "))" + for t in self.terms ) return WhenOp( "when", AndOp("and", [t.missing() for t in self.terms]), - **{"then": self.default, "else": Ruby(type=NUMBER, expr=calc, frum=self)} - ).partial_eval().to_ruby(schema) + **{"then": self.default, "else": EsScript(type=NUMBER, expr=calc, frum=self)} + ).partial_eval().to_es_script(schema) else: calc = op.join( - "(" + NumberOp("number", t).to_ruby(schema).expr + ")" + "(" + NumberOp("number", t).to_es_script(schema).expr + ")" for t in self.terms ) return WhenOp( "when", OrOp("or", [t.missing() for t in self.terms]), - **{"then": self.default, "else": Ruby(type=NUMBER, expr=calc, frum=self)} - ).partial_eval().to_ruby(schema) + **{"then": self.default, "else": EsScript(type=NUMBER, expr=calc, frum=self)} + ).partial_eval().to_es_script(schema) @extend(RegExpOp) @@ -896,7 +956,7 @@ def to_esfilter(self, schema): if isinstance(self.pattern, Literal) and isinstance(self.var, Variable): cols = schema.leaves(self.var.var) if len(cols) == 0: - return {"not": {"match_all": {}}} + return MATCH_NONE elif len(cols) == 1: return {"regexp": {cols[0].es_column: self.pattern.value}} else: @@ -906,29 +966,29 @@ def to_esfilter(self, schema): @extend(StringOp) -def to_ruby(self, schema): +def to_es_script(self, schema): term = FirstOp("first", self.term).partial_eval() - value = term.to_ruby(schema) + value = term.to_es_script(schema) if isinstance(value.frum, CoalesceOp): - return CoalesceOp("coalesce", [StringOp("string", t).partial_eval() for t in value.frum.terms]).to_ruby(schema) + return CoalesceOp("coalesce", [StringOp("string", t).partial_eval() for t in value.frum.terms]).to_es_script(schema) if value.type == BOOLEAN: - return Ruby( + return EsScript( miss=self.term.missing().partial_eval(), type=STRING, expr=value.expr + ' ? 
"T" : "F"', frum=self ) elif value.type == INTEGER: - return Ruby( + return EsScript( miss=self.term.missing().partial_eval(), type=STRING, expr="String.valueOf(" + value.expr + ")", frum=self ) elif value.type == NUMBER: - return Ruby( + return EsScript( miss=self.term.missing().partial_eval(), type=STRING, expr=expand_template(TO_STRING, {"expr":value.expr}), @@ -937,7 +997,7 @@ def to_ruby(self, schema): elif value.type == STRING: return value else: - return Ruby( + return EsScript( miss=self.term.missing().partial_eval(), type=STRING, expr=expand_template(TO_STRING, {"expr":value.expr}), @@ -949,8 +1009,8 @@ def to_ruby(self, schema): @extend(TrueOp) -def to_ruby(self, schema): - return Ruby(type=BOOLEAN, expr="true", frum=self) +def to_es_script(self, schema): + return EsScript(type=BOOLEAN, expr="true", frum=self) @extend(TrueOp) @@ -959,29 +1019,47 @@ def to_esfilter(self, schema): @extend(PrefixOp) -def to_ruby(self, schema): +def to_es_script(self, schema): if not self.field: return "true" else: - return "(" + self.field.to_ruby(schema) + ").startsWith(" + self.prefix.to_ruby(schema) + ")" + return "(" + self.field.to_es_script(schema) + ").startsWith(" + self.prefix.to_es_script(schema) + ")" @extend(PrefixOp) def to_esfilter(self, schema): - if not self.field: + if not self.expr: return {"match_all": {}} - elif isinstance(self.field, Variable) and isinstance(self.prefix, Literal): - var = schema.leaves(self.field.var)[0].es_column + elif isinstance(self.expr, Variable) and isinstance(self.prefix, Literal): + var = schema.leaves(self.expr.var)[0].es_column return {"prefix": {var: self.prefix.value}} else: - return ScriptOp("script", self.to_ruby(schema).script(schema)).to_esfilter(schema) + return ScriptOp("script", self.to_es_script(schema).script(schema)).to_esfilter(schema) + +@extend(SuffixOp) +def to_es_script(self, schema): + if not self.suffix: + return "true" + else: + return "(" + self.expr.to_es_script(schema) + ").endsWith(" + self.suffix.to_es_script(schema) + ")" + + +@extend(SuffixOp) +def to_esfilter(self, schema): + if not self.suffix: + return {"match_all": {}} + elif isinstance(self.expr, Variable) and isinstance(self.suffix, Literal): + var = schema.leaves(self.expr.var)[0].es_column + return {"regexp": {var: ".*"+string2regexp(self.suffix.value)}} + else: + return ScriptOp("script", self.to_es_script(schema).script(schema)).to_esfilter(schema) @extend(InOp) -def to_ruby(self, schema): - superset = self.superset.to_ruby(schema) - value = self.value.to_ruby(schema) - return Ruby( +def to_es_script(self, schema): + superset = self.superset.to_es_script(schema) + value = self.value.to_es_script(schema) + return EsScript( type=BOOLEAN, expr="(" + superset.expr + ").contains(" + value.expr + ")", frum=self @@ -997,26 +1075,26 @@ def to_esfilter(self, schema): var = cols[0].es_column return {"terms": {var: self.superset.value}} else: - return ScriptOp("script", self.to_ruby(schema).script(schema)).to_esfilter(schema) + return ScriptOp("script", self.to_es_script(schema).script(schema)).to_esfilter(schema) @extend(ScriptOp) -def to_ruby(self, schema): - return Ruby(type=OBJECT, expr=self.script) +def to_es_script(self, schema): + return EsScript(type=self.data_type, expr=self.script, frum=self) @extend(ScriptOp) def to_esfilter(self, schema): - return {"script": {"script": self.script}} + return {"script": es_script(self.script)} @extend(Variable) -def to_ruby(self, schema): +def to_es_script(self, schema, many=True): if self.var == ".": return "_source" else: if 
self.var == "_id": - return Ruby(type=STRING, expr='doc["_uid"].value.substring(doc["_uid"].value.indexOf(\'#\')+1)', frum=self) + return EsScript(type=STRING, expr='doc["_uid"].value.substring(doc["_uid"].value.indexOf(\'#\')+1)', frum=self) columns = schema.values(self.var) acc = [] @@ -1024,42 +1102,51 @@ def to_ruby(self, schema): varname = c.es_column frum = Variable(c.es_column) q = quote(varname) - acc.append(Ruby( - miss=frum.missing(), - type=c.type, - expr="doc[" + q + "].values", - frum=frum, - many=True + if many: + acc.append(EsScript( + miss=frum.missing(), + type=c.jx_type, + expr="doc[" + q + "].values" if c.jx_type != BOOLEAN else "doc[" + q + "].value==\"T\"", + frum=frum, + many=True + )) + else: + acc.append(EsScript( + miss=frum.missing(), + type=c.jx_type, + expr="doc[" + q + "].value" if c.jx_type != BOOLEAN else "doc[" + q + "].value==\"T\"", + frum=frum, + many=True )) if len(acc) == 0: - return NULL.to_ruby(schema) + return NULL.to_es_script(schema) elif len(acc) == 1: return acc[0] else: - return CoalesceOp("coalesce", acc).to_ruby(schema) + return CoalesceOp("coalesce", acc).to_es_script(schema) @extend(WhenOp) -def to_ruby(self, schema): +def to_es_script(self, schema): if self.simplified: - when = self.when.to_ruby(schema) - then = self.then.to_ruby(schema) - els_ = self.els_.to_ruby(schema) + when = self.when.to_es_script(schema) + then = self.then.to_es_script(schema) + els_ = self.els_.to_es_script(schema) if when is TRUE: return then elif when is FALSE: return els_ elif then.miss is TRUE: - return Ruby( + return EsScript( miss=self.missing(), type=els_.type, expr=els_.expr, frum=self ) elif els_.miss is TRUE: - return Ruby( + return EsScript( miss=self.missing(), type=then.type, expr=then.expr, @@ -1067,14 +1154,14 @@ def to_ruby(self, schema): ) elif then.type == els_.type: - return Ruby( + return EsScript( miss=self.missing(), type=then.type, expr="(" + when.expr + ") ? (" + then.expr + ") : (" + els_.expr + ")", frum=self ) elif then.type in (INTEGER, NUMBER) and els_.type in (INTEGER, NUMBER): - return Ruby( + return EsScript( miss=self.missing(), type=NUMBER, expr="(" + when.expr + ") ? 
(" + then.expr + ") : (" + els_.expr + ")", @@ -1083,7 +1170,7 @@ def to_ruby(self, schema): else: Log.error("do not know how to handle") else: - return self.partial_eval().to_ruby(schema) + return self.partial_eval().to_es_script(schema) @extend(WhenOp) @@ -1097,12 +1184,12 @@ def to_esfilter(self, schema): @extend(BasicIndexOfOp) -def to_ruby(self, schema): - v = StringOp("string", self.value).to_ruby(schema).expr - find = StringOp("string", self.find).to_ruby(schema).expr - start = IntegerOp("integer", self.start).to_ruby(schema).expr +def to_es_script(self, schema): + v = StringOp("string", self.value).to_es_script(schema).expr + find = StringOp("string", self.find).to_es_script(schema).expr + start = IntegerOp("integer", self.start).to_es_script(schema).expr - return Ruby( + return EsScript( miss=FALSE, type=INTEGER, expr="(" + v + ").indexOf(" + find + ", " + start + ")", @@ -1112,16 +1199,16 @@ def to_ruby(self, schema): @extend(BasicIndexOfOp) def to_esfilter(self, schema): - return ScriptOp("", self.to_ruby(schema).script(schema)).to_esfilter(schema) + return ScriptOp("", self.to_es_script(schema).script(schema)).to_esfilter(schema) @extend(BasicSubstringOp) -def to_ruby(self, schema): - v = StringOp("string", self.value).partial_eval().to_ruby(schema).expr - start = IntegerOp("string", self.start).partial_eval().to_ruby(schema).expr - end = IntegerOp("integer", self.end).partial_eval().to_ruby(schema).expr +def to_es_script(self, schema): + v = StringOp("string", self.value).partial_eval().to_es_script(schema).expr + start = IntegerOp("string", self.start).partial_eval().to_es_script(schema).expr + end = IntegerOp("integer", self.end).partial_eval().to_es_script(schema).expr - return Ruby( + return EsScript( miss=FALSE, type=STRING, expr="(" + v + ").substring(" + start + ", " + end + ")", @@ -1131,7 +1218,7 @@ def to_ruby(self, schema): MATCH_ALL = wrap({"match_all": {}}) -MATCH_NONE = wrap({"not": {"match_all": {}}}) +MATCH_NONE = es_not({"match_all": {}}) def simplify_esfilter(esfilter): @@ -1159,8 +1246,8 @@ def _normalize(esfilter): while isDiff: isDiff = False - if esfilter.bool.must: - terms = esfilter.bool.must + if esfilter['and']: + terms = esfilter['and'] for (i0, t0), (i1, t1) in itertools.product(enumerate(terms), enumerate(terms)): if i0 == i1: continue # SAME, IGNORE @@ -1201,10 +1288,10 @@ def _normalize(esfilter): continue if a == MATCH_NONE: return MATCH_NONE - if a.bool.must: + if a['and']: isDiff = True a.isNormal = None - output.extend(a.bool.must) + output.extend(a['and']) else: a.isNormal = None output.append(a) @@ -1215,7 +1302,7 @@ def _normalize(esfilter): esfilter = output[0] break elif isDiff: - esfilter = wrap({"and": output}) + esfilter = es_and(output) continue if esfilter.bool.should: @@ -1255,18 +1342,14 @@ def _normalize(esfilter): if OR(vv == None for vv in v): rest = [vv for vv in v if vv != None] if len(rest) > 0: - return { - "or": [ - {"missing": {"field": k}}, - {"terms": {k: rest}} - ], - "isNormal": True - } + output = es_or([ + es_missing(k), + {"terms": {k: rest}} + ]) else: - return { - "missing": {"field": k}, - "isNormal": True - } + output = es_missing(k) + output.isNormal = True + return output else: esfilter.isNormal = True return esfilter @@ -1308,18 +1391,18 @@ def split_expression_by_depth(where, schema, output=None, var_to_depth=None): if not vars_: return Null # MAP VARIABLE NAMES TO HOW DEEP THEY ARE - var_to_depth = {v: len(c.nested_path) - 1 for v in vars_ for c in schema.values(v)} + var_to_depth = {v.var: 
max(len(c.nested_path) - 1, 0) for v in vars_ for c in schema[v.var]} all_depths = set(var_to_depth.values()) - if -1 in all_depths: - Log.error( - "Can not find column with name {{column|quote}}", - column=unwraplist([k for k, v in var_to_depth.items() if v == -1]) - ) + # if -1 in all_depths: + # Log.error( + # "Can not find column with name {{column|quote}}", + # column=unwraplist([k for k, v in var_to_depth.items() if v == -1]) + # ) if len(all_depths) == 0: all_depths = {0} output = wrap([[] for _ in range(MAX(all_depths) + 1)]) else: - all_depths = set(var_to_depth[v] for v in vars_) + all_depths = set(var_to_depth[v.var] for v in vars_) if len(all_depths) == 1: output[list(all_depths)[0]] += [where] @@ -1336,10 +1419,10 @@ def get_type(var_name): type_ = var_name.split(".$")[1:] if not type_: return "j" - return json_type_to_painless_type.get(type_[0], "j") + return json_type_to_es_script_type.get(type_[0], "j") -json_type_to_painless_type = { +json_type_to_es_script_type = { "string": "s", "boolean": "b", "number": "n" diff --git a/vendor/jx_elasticsearch/es14/format.py b/vendor/jx_elasticsearch/es14/format.py index afb7345..86618e3 100644 --- a/vendor/jx_elasticsearch/es14/format.py +++ b/vendor/jx_elasticsearch/es14/format.py @@ -11,17 +11,15 @@ from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals -from collections import Mapping - -from mo_dots import Data, set_default, wrap, split_field, coalesce -from mo_logs import Log -from pyLibrary import convert - from jx_base.expressions import TupleOp from jx_elasticsearch.es14.aggs import count_dim, aggs_iterator, format_dispatch, drill from jx_python.containers.cube import Cube from mo_collections.matrix import Matrix +from mo_dots import Data, set_default, wrap, split_field, coalesce +from mo_future import sort_using_key +from mo_logs import Log from mo_logs.strings import quote +from pyLibrary import convert FunctionType = type(lambda: 1) @@ -51,7 +49,7 @@ def format_cube(decoders, aggs, start, query, select): cube = Cube( query.select, - sorted(new_edges, key=lambda e: e.dim), # ENSURE EDGES ARE IN SAME ORDER AS QUERY + sort_using_key(new_edges, key=lambda e: e.dim), # ENSURE EDGES ARE IN SAME ORDER AS QUERY {s.name: m for s, m in matricies} ) cube.frum = query @@ -184,7 +182,7 @@ def format_list_from_groupby(decoders, aggs, start, query, select): continue output = Data() for g, d in zip(query.groupby, decoders): - output[g.put.name] = d.get_value_from_row(row) + output[coalesce(g.put.name, g.name)] = d.get_value_from_row(row) for s in select: output[s.name] = s.pull(agg) @@ -210,7 +208,7 @@ def format_list(decoders, aggs, start, query, select): if query.sort and not query.groupby: # TODO: USE THE format_table() TO PRODUCE THE NEEDED VALUES INSTEAD OF DUPLICATING LOGIC HERE all_coord = is_sent._all_combos() # TRACK THE EXPECTED COMBINATIONS - for row, coord, agg in aggs_iterator(aggs, decoders): + for _, coord, agg in aggs_iterator(aggs, decoders): missing_coord = all_coord.next() while coord != missing_coord: # INSERT THE MISSING COORDINATE INTO THE GENERATION @@ -232,7 +230,7 @@ def format_list(decoders, aggs, start, query, select): output[s.name] = s.pull(agg) yield output else: - is_sent = Matrix(dims=dims, zeros=0) + for row, coord, agg in aggs_iterator(aggs, decoders): is_sent[coord] = 1 @@ -286,12 +284,6 @@ def format_list_from_aggop(decoders, aggs, start, query, select): }) - - - - - - def format_line(decoders, aggs, start, query, select): list = 
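split_expression_by_depth buckets the AND-ed parts of a where clause by the nesting depth of the variables each part touches, so every bucket can be applied at its own level of the nested query. A simplified model, assuming each part references exactly one depth (the case the real function handles without recursing):

```python
def split_by_depth(parts, depth_of_var):
    # parts: [(expression, variable_names)]; depth_of_var: {name: int}
    buckets = {}
    for expr, vars_ in parts:
        depths = {depth_of_var[v] for v in vars_}
        if len(depths) != 1:
            raise ValueError("expecting each part to touch a single depth")
        buckets.setdefault(depths.pop(), []).append(expr)
    return buckets
```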
format_list(decoders, aggs, start, query, select) diff --git a/vendor/jx_elasticsearch/es14/setop.py b/vendor/jx_elasticsearch/es14/setop.py index 0bb8133..8671cbc 100644 --- a/vendor/jx_elasticsearch/es14/setop.py +++ b/vendor/jx_elasticsearch/es14/setop.py @@ -19,12 +19,11 @@ from jx_base.expressions import IDENTITY from jx_base.query import DEFAULT_LIMIT from jx_elasticsearch import post as es_post from jx_elasticsearch.es14.expressions import Variable, LeavesOp -from jx_elasticsearch.es14.util import jx_sort_to_es_sort, es_query_template +from jx_elasticsearch.es14.util import jx_sort_to_es_sort, es_query_template, es_and, es_or, es_not, es_script from jx_python.containers.cube import Cube from jx_python.expressions import jx_expression_to_function from mo_collections.matrix import Matrix -from mo_dots import coalesce, split_field, set_default, Data, unwraplist, literal_field, unwrap, wrap, concat_field, relative_field, join_field -from mo_dots import listwrap +from mo_dots import coalesce, split_field, set_default, Data, unwraplist, literal_field, unwrap, wrap, concat_field, relative_field, join_field, listwrap from mo_dots.lists import FlatList from mo_json.typed_encoder import untype_path, unnest_path, untyped from mo_logs import Log @@ -56,7 +55,7 @@ def is_setop(es, query): def es_setop(es, query): schema = query.frum.schema - es_query, filters = es_query_template(schema.query_path) + es_query, filters = es_query_template(schema.query_path[0]) nested_filter = None set_default(filters[0], query.where.partial_eval().to_esfilter(schema)) es_query.size = coalesce(query.limit, DEFAULT_LIMIT) @@ -78,7 +77,7 @@ def es_setop(es, query): leaves = schema.leaves(term.var) for c in leaves: full_name = concat_field(select.name, relative_field(untype_path(c.names["."]), term.var)) - if c.type == NESTED: + if c.jx_type == NESTED: es_query.fields = ["_source"] new_select.append({ "name": full_name, @@ -88,7 +87,7 @@ def es_setop(es, query): }) put_index += 1 elif c.nested_path[0] != ".": - es_query.fields = ["_source"] + pass # THE NESTED PARENT WILL CAPTURE THIS else: es_query.fields += [c.es_column] new_select.append({ @@ -103,7 +102,7 @@ def es_setop(es, query): leaves = schema.leaves(s_column) nested_selects = {} if leaves: - if any(c.type == NESTED for c in leaves): + if s_column == '.' 
or any(c.jx_type == NESTED for c in leaves): # PULL WHOLE NESTED ARRAYS es_query.fields = ["_source"] for c in leaves: @@ -120,7 +119,7 @@ def es_setop(es, query): for c in leaves: if len(c.nested_path) == 1: jx_name = untype_path(c.names["."]) - if c.type == NESTED: + if c.jx_type == NESTED: es_query.fields = ["_source"] new_select.append({ "name": select.name, @@ -144,7 +143,7 @@ def es_setop(es, query): filters[0][k] = None set_default( filters[0], - {"and": [where, {"or": nested_filter}]} + es_and([where, es_or(nested_filter)]) ) nested_path = c.nested_path[0] @@ -156,7 +155,7 @@ def es_setop(es, query): where.nested.inner_hits._source = False where.nested.inner_hits.fields += [c.es_column] - child = relative_field(untype_path(c.names[schema.query_path]), s_column) + child = relative_field(untype_path(c.names[schema.query_path[0]]), s_column) pull = accumulate_nested_doc(nested_path, Variable(relative_field(s_column, unnest_path(nested_path)))) new_select.append({ "name": select.name, @@ -169,7 +168,7 @@ def es_setop(es, query): "pull": pull }) else: - nested_selects[nested_path].nested.inner_hits.fields+=[c.es_column] + nested_selects[nested_path].nested.inner_hits.fields += [c.es_column] else: new_select.append({ "name": select.name, @@ -178,9 +177,8 @@ def es_setop(es, query): }) put_index += 1 else: - painless = select.value.partial_eval().to_ruby(schema) - es_query.script_fields[literal_field(select.name)] = {"script": painless.script(schema)} - + painless = select.value.partial_eval().to_es_script(schema) + es_query.script_fields[literal_field(select.name)] = es_script(painless.script(schema)) new_select.append({ "name": select.name, "pull": jx_expression_to_function("fields." + literal_field(select.name)), @@ -345,6 +343,7 @@ set_default(format_dispatch, { "list": (format_list, None, "application/json") }) + def get_pull(column): if column.nested_path[0] == ".": return concat_field("fields", literal_field(column.es_column)) diff --git a/vendor/jx_elasticsearch/es14/util.py b/vendor/jx_elasticsearch/es14/util.py index 02b0e3d..529a7b1 100644 --- a/vendor/jx_elasticsearch/es14/util.py +++ b/vendor/jx_elasticsearch/es14/util.py @@ -11,6 +11,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals +from mo_future import text_type + +from mo_logs import Log + from jx_base import STRING, BOOLEAN, NUMBER, OBJECT from jx_elasticsearch.es14.expressions import Variable from mo_dots import wrap @@ -23,18 +27,21 @@ def es_query_template(path): :return: """ + if not isinstance(path, text_type): + Log.error("expecting path to be a string") + if path != ".": f0 = {} f1 = {} output = wrap({ - "query": {"filtered": {"filter": {"and":[ + "query": {"filtered": {"filter": es_and([ f0, {"nested": { "path": path, "filter": f1, "inner_hits": {"size": 100000} }} - ]}}}, + ])}}, "from": 0, "size": 0, "sort": [] @@ -43,7 +50,7 @@ def es_query_template(path): else: f0 = {} output = wrap({ - "query": {"filtered": {"filter": f0}}, + "query": {"filtered": {"filter": es_and([f0])}}, "from": 0, "size": 0, "sort": [] @@ -66,7 +73,7 @@ def jx_sort_to_es_sort(sort, schema): for type in types: for c in cols: - if c.type == type: + if c.jx_type == type: if s.sort == -1: output.append({c.es_column: "desc"}) else: @@ -109,3 +116,22 @@ aggregates = { NON_STATISTICAL_AGGS = {"none", "one"} + +def es_and(terms): + return wrap({"and": terms}) + + +def es_or(terms): + return wrap({"or": terms}) + + +def es_not(term): + return wrap({"not": term}) + + +def 
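es_query_template returns both the query skeleton and the mutable filter slots: for a nested path there is one slot beside the nested clause and one inside it, which is what lets es_setop and split_expression_by_depth place each filter at the right depth. The shape it builds, sketched:

```python
def nested_query_template(path):
    outer, inner = {}, {}  # filter slots the caller fills in later
    skeleton = {
        "query": {"filtered": {"filter": {"and": [
            outer,
            {"nested": {
                "path": path,
                "filter": inner,
                "inner_hits": {"size": 100000},
            }},
        ]}}},
        "from": 0,
        "size": 0,
        "sort": [],
    }
    return skeleton, [outer, inner]
```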
es_script(term): + return wrap({"script": term}) + + +def es_missing(term): + return {"missing": {"field": term}} diff --git a/vendor/jx_elasticsearch/es52/__init__.py b/vendor/jx_elasticsearch/es52/__init__.py index a26c5cf..6287d14 100644 --- a/vendor/jx_elasticsearch/es52/__init__.py +++ b/vendor/jx_elasticsearch/es52/__init__.py @@ -19,12 +19,11 @@ from jx_base.dimensions import Dimension from jx_base.expressions import jx_expression from jx_base.queries import is_variable_name from jx_base.query import QueryOp -from jx_base.schema import Schema from jx_elasticsearch.es52.aggs import es_aggsop, is_aggsop from jx_elasticsearch.es52.deep import is_deepop, es_deepop from jx_elasticsearch.es52.setop import is_setop, es_setop from jx_elasticsearch.es52.util import aggregates -from jx_elasticsearch.meta import FromESMetadata +from jx_elasticsearch.meta import ElasticsearchMetadata, Table from jx_python import jx from mo_dots import Data, Null, unwrap, coalesce, split_field, literal_field, unwraplist, join_field, wrap, listwrap, FlatList from mo_json import scrub, value2json @@ -41,7 +40,7 @@ class ES52(Container): def __new__(cls, *args, **kwargs): if (len(args) == 1 and args[0].get("index") == "meta") or kwargs.get("index") == "meta": - output = FromESMetadata.__new__(FromESMetadata, *args, **kwargs) + output = ElasticsearchMetadata.__new__(ElasticsearchMetadata, *args, **kwargs) output.__init__(*args, **kwargs) return output else: @@ -62,26 +61,25 @@ class ES52(Container): typed=None, kwargs=None ): - Container.__init__(self, None) + Container.__init__(self) if not container.config.default: container.config.default = { "type": "elasticsearch", "settings": unwrap(kwargs) } self.settings = kwargs - self.name = coalesce(name, alias, index) + self.name = name = coalesce(name, alias, index) if read_only: self.es = elasticsearch.Alias(alias=coalesce(alias, index), kwargs=kwargs) else: self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs) - self.meta = FromESMetadata(kwargs=kwargs) + self._namespace = ElasticsearchMetadata(kwargs=kwargs) self.settings.type = self.es.settings.type self.edges = Data() self.worker = None - columns = self.meta.get_columns(table_name=coalesce(name, alias, index)) - self._schema = Schema(coalesce(name, alias, index), columns) + columns = self._namespace.get_snowflake(self._es.settings.alias).columns # ABSOLUTE COLUMNS if typed == None: # SWITCH ON TYPED MODE @@ -90,8 +88,19 @@ class ES52(Container): self.typed = typed @property - def schema(self): - return self._schema + def snowflake(self): + return self._namespace.get_snowflake(self._es.settings.alias) + + @property + def namespace(self): + return self._namespace + + + def get_table(self, full_name): + return Table(full_name, self) + + def get_schema(self, query_path): + return self._namespace.get_schema(query_path) def __data__(self): settings = self.settings.copy() @@ -122,10 +131,7 @@ class ES52(Container): def query(self, _query): try: - query = QueryOp.wrap(_query, table=self, schema=self.schema) - - for n in self.namespaces: - query = n.convert(query) + query = QueryOp.wrap(_query, container=self, namespace=self.namespace) for s in listwrap(query.select): if s.aggregate != None and not aggregates.get(s.aggregate): @@ -209,7 +215,7 @@ class ES52(Container): scripts.append({"doc": v.doc}) else: v = scrub(v) - scripts.append({"script": "ctx._source." + k + " = " + jx_expression(v).to_painless(schema).script(schema)}) + scripts.append({"script": "ctx._source." 
+ k + " = " + jx_expression(v).to_es_script(schema).script(schema)}) if results.hits.hits: updates = [] diff --git a/vendor/jx_elasticsearch/es52/aggs.py b/vendor/jx_elasticsearch/es52/aggs.py index 4813598..faa1606 100644 --- a/vendor/jx_elasticsearch/es52/aggs.py +++ b/vendor/jx_elasticsearch/es52/aggs.py @@ -14,7 +14,7 @@ from __future__ import unicode_literals from jx_base import EXISTS from jx_base.domains import SetDomain from jx_base.expressions import TupleOp, NULL -from jx_base.query import DEFAULT_LIMIT +from jx_base.query import DEFAULT_LIMIT, MAX_LIMIT from jx_elasticsearch import post as es_post from jx_elasticsearch.es52.decoders import DefaultDecoder, AggsDecoder, ObjectDecoder, DimFieldListDecoder from jx_elasticsearch.es52.expressions import split_expression_by_depth, AndOp, Variable, NullOp @@ -30,7 +30,6 @@ from mo_logs.strings import quote, expand_template from mo_math import Math, MAX, UNION from mo_times.timer import Timer - COMPARE_TUPLE = """ (a, b)->{ int i=0; @@ -79,7 +78,6 @@ MAX_OF_TUPLE = """ def is_aggsop(es, query): - es.cluster.get_metadata() if query.edges or query.groupby or any(a != None and a != "none" for a in listwrap(query.select).aggregate): return True return False @@ -106,12 +104,12 @@ def get_decoders_by_depth(query): edge = edge.copy() vars_ = edge.value.vars() for v in vars_: - if not schema.leaves(v.var, meta=True): + if not schema.leaves(v.var): Log.error("{{var}} does not exist in schema", var=v) elif edge.range: vars_ = edge.range.min.vars() | edge.range.max.vars() for v in vars_: - if not schema[v]: + if not schema[v.var]: Log.error("{{var}} does not exist in schema", var=v) elif edge.domain.dimension: vars_ = edge.domain.dimension.fields @@ -148,10 +146,8 @@ def sort_edges(query, prop): ordered_edges = [] remaining_edges = getattr(query, prop) for s in query.sort: - if not isinstance(s.value, Variable): - Log.error("can only sort by terms") for e in remaining_edges: - if e.value.var == s.value.var: + if e.value == s.value: if isinstance(e.domain, SetDomain): pass # ALREADY SORTED? 
else: @@ -159,6 +155,9 @@ def sort_edges(query, prop): ordered_edges.append(e) remaining_edges.remove(e) break + else: + Log.error("Can not sort by {{expr}}, can only sort by an existing edge expression", expr=s.value) + ordered_edges.extend(remaining_edges) return ordered_edges @@ -187,33 +186,33 @@ def es_aggsop(es, frum, query): for canonical_name, many in new_select.items(): for s in many: - es_cols = frum.schema.values(s.value.var) + columns = frum.schema.values(s.value.var) if s.aggregate == "count": canonical_names = [] - for es_col in es_cols: - cn = literal_field(es_col.es_column + "_count") - if es_col.type == EXISTS: + for column in columns: + cn = literal_field(column.es_column + "_count") + if column.jx_type == EXISTS: canonical_names.append(cn + ".doc_count") - es_query.aggs[cn].filter.range = {es_col.es_column: {"gt": 0}} + es_query.aggs[cn].filter.range = {column.es_column: {"gt": 0}} else: canonical_names.append(cn+ ".value") - es_query.aggs[cn].value_count.field = es_col.es_column - if len(es_cols) == 1: + es_query.aggs[cn].value_count.field = column.es_column + if len(canonical_names) == 1: s.pull = jx_expression_to_function(canonical_names[0]) else: s.pull = jx_expression_to_function({"add": canonical_names}) elif s.aggregate == "median": - if len(es_cols) > 1: + if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") - es_query.aggs[key].percentiles.field = es_cols[0].es_column + es_query.aggs[key].percentiles.field = columns[0].es_column es_query.aggs[key].percentiles.percents += [50] s.pull = jx_expression_to_function(key + ".values.50\\.0") elif s.aggregate == "percentile": - if len(es_cols) > 1: + if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") @@ -221,48 +220,48 @@ def es_aggsop(es, frum, query): Log.error("Expecting percentile to be a float from 0.0 to 1.0") percent = Math.round(s.percentile * 100, decimal=6) - es_query.aggs[key].percentiles.field = es_cols[0].es_column + es_query.aggs[key].percentiles.field = columns[0].es_column es_query.aggs[key].percentiles.percents += [percent] s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent))) elif s.aggregate == "cardinality": canonical_names = [] - for es_col in es_cols: - cn = literal_field(es_col.es_column + "_cardinality") + for column in columns: + cn = literal_field(column.es_column + "_cardinality") canonical_names.append(cn) - es_query.aggs[cn].cardinality.field = es_col.es_column - if len(es_cols) == 1: + es_query.aggs[cn].cardinality.field = column.es_column + if len(columns) == 1: s.pull = jx_expression_to_function(canonical_names[0] + ".value") else: s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0}) elif s.aggregate == "stats": - if len(es_cols) > 1: + if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # REGULAR STATS stats_name = literal_field(canonical_name) - es_query.aggs[stats_name].extended_stats.field = es_cols[0].es_column + es_query.aggs[stats_name].extended_stats.field = columns[0].es_column # GET MEDIAN TOO! 
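# editorial sketch (not part of the patch): the median/percentile/stats
# branches above all build an Elasticsearch `percentiles` aggregation and
# later pull `<key>.values.50\.0` out of the response. Assuming a
# hypothetical numeric column "duration", the request fragment assembled
# here looks roughly like:
example_percentile_agg = {
    "aggs": {
        "duration percentile": {
            "percentiles": {
                "field": "duration",  # hypothetical es_column
                "percents": [50]      # the 50th percentile is the median
            }
        }
    }
}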
median_name = literal_field(canonical_name + "_percentile") - es_query.aggs[median_name].percentiles.field = es_cols[0].es_column + es_query.aggs[median_name].percentiles.field = columns[0].es_column es_query.aggs[median_name].percentiles.percents += [50] s.pull = get_pull_stats(stats_name, median_name) elif s.aggregate == "union": pulls = [] - for es_col in es_cols: + for column in columns: script = {"scripted_metric": { 'init_script': 'params._agg.terms = new HashSet()', - 'map_script': 'for (v in doc['+quote(es_col.es_column)+'].values) params._agg.terms.add(v)', + 'map_script': 'for (v in doc['+quote(column.es_column)+'].values) params._agg.terms.add(v)', 'combine_script': 'return params._agg.terms.toArray()', 'reduce_script': 'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()', }} - stats_name = encode_property(es_col.es_column) - if es_col.nested_path[0] == ".": + stats_name = encode_property(column.es_column) + if column.nested_path[0] == ".": es_query.aggs[stats_name] = script pulls.append(jx_expression_to_function(stats_name + ".value")) else: es_query.aggs[stats_name] = { - "nested": {"path": es_col.nested_path[0]}, + "nested": {"path": column.nested_path[0]}, "aggs": {"_nested": script} } pulls.append(jx_expression_to_function(stats_name + "._nested.value")) @@ -274,11 +273,11 @@ def es_aggsop(es, frum, query): else: s.pull = lambda row: UNION(p(row) for p in pulls) else: - if len(es_cols) > 1: + if len(columns) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # PULL VALUE OUT OF THE stats AGGREGATE - es_query.aggs[literal_field(canonical_name)].extended_stats.field = es_cols[0].es_column + es_query.aggs[literal_field(canonical_name)].extended_stats.field = columns[0].es_column s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." 
+ aggregates[s.aggregate], s.default]}) for i, s in enumerate(formula): @@ -296,8 +295,8 @@ def es_aggsop(es, frum, query): dir = -1 op = 'min' - nully = TupleOp("tuple", [NULL]*len(s.value.terms)).partial_eval().to_painless(schema).expr - selfy = s.value.partial_eval().to_painless(schema).expr + nully = TupleOp("tuple", [NULL]*len(s.value.terms)).partial_eval().to_es_script(schema).expr + selfy = s.value.partial_eval().to_es_script(schema).expr script = {"scripted_metric": { 'init_script': 'params._agg.best = ' + nully + ';', @@ -317,13 +316,13 @@ def es_aggsop(es, frum, query): else: Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate) elif s.aggregate == "count": - es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_painless(schema).script(schema) + es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_es_script(schema).script(schema) s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value") elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") - es_query.aggs[key].percentiles.script = s.value.to_painless(schema).script(schema) + es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema) es_query.aggs[key].percentiles.percents += [50] s.pull = jx_expression_to_function(key + ".values.50\\.0") elif s.aggregate == "percentile": @@ -331,35 +330,35 @@ def es_aggsop(es, frum, query): key = literal_field(canonical_name + " percentile") percent = Math.round(s.percentile * 100, decimal=6) - es_query.aggs[key].percentiles.script = s.value.to_painless(schema).script(schema) + es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema) es_query.aggs[key].percentiles.percents += [percent] s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent))) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = canonical_name + " cardinality" - es_query.aggs[key].cardinality.script = s.value.to_painless(schema).script(schema) + es_query.aggs[key].cardinality.script = s.value.to_es_script(schema).script(schema) s.pull = jx_expression_to_function(key + ".value") elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) - es_query.aggs[stats_name].extended_stats.script = s.value.to_painless(schema).script(schema) + es_query.aggs[stats_name].extended_stats.script = s.value.to_es_script(schema).script(schema) # GET MEDIAN TOO! median_name = literal_field(canonical_name + " percentile") - es_query.aggs[median_name].percentiles.script = s.value.to_painless(schema).script(schema) + es_query.aggs[median_name].percentiles.script = s.value.to_es_script(schema).script(schema) es_query.aggs[median_name].percentiles.percents += [50] s.pull = get_pull_stats(stats_name, median_name) elif s.aggregate == "union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) - es_query.aggs[stats_name].terms.script_field = s.value.to_painless(schema).script(schema) + es_query.aggs[stats_name].terms.script_field = s.value.to_es_script(schema).script(schema) s.pull = jx_expression_to_function(stats_name + ".buckets.key") else: # PULL VALUE OUT OF THE stats AGGREGATE s.pull = jx_expression_to_function(canonical_name + "." 
+ aggregates[s.aggregate]) - es_query.aggs[canonical_name].extended_stats.script = s.value.to_painless(schema).script(schema) + es_query.aggs[canonical_name].extended_stats.script = s.value.to_es_script(schema).script(schema) decoders = get_decoders_by_depth(query) start = 0 @@ -384,11 +383,7 @@ def es_aggsop(es, frum, query): es_query = wrap({ "aggs": {"_nested": set_default( - { - "nested": { - "path": schema.query_path - } - }, + {"nested": {"path": schema.query_path[0]}}, es_query )} }) diff --git a/vendor/jx_elasticsearch/es52/decoders.py b/vendor/jx_elasticsearch/es52/decoders.py index 99a7f5d..44c9325 100644 --- a/vendor/jx_elasticsearch/es52/decoders.py +++ b/vendor/jx_elasticsearch/es52/decoders.py @@ -13,21 +13,21 @@ from __future__ import unicode_literals from collections import Mapping -from mo_future import text_type, binary_type - +from jx_base import STRING, NUMBER, BOOLEAN from jx_base.dimensions import Dimension from jx_base.domains import SimpleSetDomain, DefaultDomain, PARTITION -from jx_base.expressions import TupleOp, value2json +from jx_base.expressions import TupleOp, TRUE from jx_base.query import MAX_LIMIT, DEFAULT_LIMIT -from jx_elasticsearch.es52.expressions import Variable, NotOp, InOp, Literal, OrOp, AndOp, InequalityOp, LeavesOp, LIST_TO_PIPE +from jx_elasticsearch.es52.expressions import Variable, NotOp, InOp, Literal, AndOp, InequalityOp, LeavesOp, LIST_TO_PIPE +from jx_elasticsearch.es52.util import es_missing from jx_python import jx -from mo_dots import set_default, coalesce, literal_field, Data, relative_field, unwraplist -from mo_dots import wrap +from mo_dots import wrap, set_default, coalesce, literal_field, Data, relative_field, unwraplist +from mo_future import text_type from mo_json.typed_encoder import untype_path from mo_logs import Log from mo_logs.strings import quote, expand_template -from mo_math import MAX, MIN -from mo_math import Math +from mo_math import MAX, MIN, Math +from pyLibrary.convert import string2boolean class AggsDecoder(object): @@ -144,6 +144,7 @@ class SetDecoder(AggsDecoder): AggsDecoder.__init__(self, edge, query, limit) domain = self.domain = edge.domain self.sorted = None + self.pull = pull_functions[STRING] # WE ASSUME IF THE VARIABLES MATCH, THEN THE SORT TERM AND EDGE TERM MATCH, AND WE SORT BY TERM # self.sorted = {1: "asc", -1: "desc", None: None}[getattr(edge.domain, 'sort', None)] @@ -186,7 +187,7 @@ class SetDecoder(AggsDecoder): terms = set_default({"terms": { "script": { "lang": "painless", - "inline": value.to_painless(self.schema).script(self.schema) + "inline": value.to_es_script(self.schema).script(self.schema) }, "size": limit }}, es_query) @@ -213,7 +214,7 @@ class SetDecoder(AggsDecoder): return self.domain.getKeyByIndex(index) def get_value_from_row(self, row): - return row[self.start].get('key') + return self.pull(row[self.start].get('key')) def get_index(self, row): try: @@ -249,7 +250,7 @@ def _range_composer(edge, domain, es_query, to_float, schema): if isinstance(edge.value, Variable): calc = {"field": schema.leaves(edge.value.var)[0].es_column} else: - calc = {"script": edge.value.to_painless(schema).script(schema)} + calc = {"script": edge.value.to_es_script(schema).script(schema)} return wrap({"aggs": { "_match": set_default( @@ -464,7 +465,7 @@ class MultivalueDecoder(SetDecoder): self.start = start es_field = self.query.frum.schema.leaves(self.var)[0].es_column - es_query = wrap({"aggs": { + es_query = wrap({"aggs": { "_match": set_default({"terms": { "script": expand_template(LIST_TO_PIPE, 
{"expr": 'doc[' + quote(es_field) + '].values'}) }}, es_query) @@ -521,7 +522,7 @@ class ObjectDecoder(AggsDecoder): "size": self.domain.limit }}, es_query), "_missing": set_default( - {"filter": {"bool": {"must_not": {"exists": {"field": v}}}}}, + {"filter": es_missing(v)}, es_query ) }}) @@ -580,73 +581,67 @@ class DefaultDecoder(SetDecoder): self.parts = list() self.key2index = {} self.computed_domain = False + self.script = self.edge.value.partial_eval().to_es_script(self.schema) + self.pull = pull_functions[self.script.data_type] + self.missing = self.script.miss.partial_eval() + self.exists = NotOp("not", self.missing).partial_eval() - # WE ASSUME IF THE VARIABLES MATCH, THEN THE SORT TERM AND EDGE TERM MATCH, AND WE SORT BY TERM - self.sorted = None - edge_var = edge.value.vars() - for s in query.sort: - if not edge_var - s.value.vars(): - self.sorted = {1: "asc", -1: "desc"}[s.sort] + # WHEN SORT VALUE AND EDGE VALUE MATCHES, WE SORT BY TERM + sort_candidates = [s for s in self.query.sort if s.value == self.edge.value] + if sort_candidates: + self.es_order = {"_term": {1: "asc", -1: "desc"}[sort_candidates[0].sort]} + else: + self.es_order = None def append_query(self, es_query, start): self.start = start - value = self.edge.value.partial_eval() - script = value.to_painless(self.schema) - exists = NotOp("not", script.miss).partial_eval() if not isinstance(self.edge.value, Variable): - - output = wrap({"aggs": { - "_match": { - "filter": exists.to_esfilter(self.schema), - "aggs": { - "_filter": set_default( - {"terms": { - "script": { - "lang": "painless", - "inline": script.expr - }, - "size": self.domain.limit, - "order": {"_term": self.sorted} if self.sorted else None - }}, - es_query - ) - } - }, - "_missing": set_default( - {"filter": NotOp("not", exists).to_esfilter(self.schema)}, - es_query - ) - }}) - return output - elif self.edge.value.var in [s.value.var for s in self.query.sort]: - sort_dir = [s.sort for s in self.query.sort if s.value.var == self.edge.value.var][0] - output = wrap({"aggs": { - "_match": set_default( - {"terms": { - "field": self.schema.leaves(self.edge.value.var)[0].es_column, - "size": self.domain.limit, - "order": {"_term": "asc" if sort_dir == 1 else "desc"} - }}, - es_query - ), - "_missing": set_default( - {"filter": NotOp("not", exists).to_esfilter(self.schema)}, - es_query - ) - }}) + if self.exists is TRUE: + # IF True THEN WE DO NOT NEED THE _filter OR THE _missing (THIS RARELY HAPPENS THOUGH) + output = wrap({"aggs": { + "_match": set_default( + {"terms": { + "script": {"lang": "painless", "inline": self.script.expr}, + "size": self.domain.limit, + "order": self.es_order + }}, + es_query + ) + }}) + else: + output = wrap({"aggs": { + "_match": { # _match AND _filter REVERSED SO _match LINES UP WITH _missing + "filter": self.exists.to_esfilter(self.schema), + "aggs": { + "_filter": set_default( + {"terms": { + "script": {"lang": "painless", "inline": self.script.expr}, + "size": self.domain.limit, + "order": self.es_order + }}, + es_query + ) + } + }, + "_missing": set_default( + {"filter": self.missing.to_esfilter(self.schema)}, + es_query + ) + }}) return output else: output = wrap({"aggs": { "_match": set_default( {"terms": { "field": self.schema.leaves(self.edge.value.var)[0].es_column, - "size": self.domain.limit + "size": self.domain.limit, + "order": self.es_order }}, es_query ), "_missing": set_default( - {"filter": NotOp("not", exists).to_esfilter(self.schema)}, + {"filter": self.missing.to_esfilter(self.schema)}, es_query ) }}) @@ 
-656,7 +651,7 @@ class DefaultDecoder(SetDecoder): part = row[self.start] if part['doc_count']: if part.get('key') != None: - self.parts.append(part.get('key')) + self.parts.append(self.pull(part.get('key'))) else: self.edge.allowNulls = True # OK! WE WILL ALLOW NULLS @@ -671,19 +666,19 @@ class DefaultDecoder(SetDecoder): if self.computed_domain: try: part = row[self.start] - return self.domain.getIndexByKey(part.get('key')) + return self.domain.getIndexByKey(self.pull(part.get('key'))) except Exception as e: Log.error("problem", cause=e) else: try: part = row[self.start] - key = part.get('key') + key = self.pull(part.get('key')) i = self.key2index.get(key) if i is None: i = len(self.parts) part = {"key": key, "dataIndex": i} - self.parts.append({"key": key, "dataIndex": i}) - self.key2index[i] = part + self.parts.append(part) + self.key2index[key] = i return i except Exception as e: Log.error("problem", cause=e) @@ -755,3 +750,8 @@ class DimFieldListDecoder(SetDecoder): return len(self.fields) +pull_functions = { + STRING: lambda x: x, + NUMBER: lambda x: float(x) if x !=None else None, + BOOLEAN: string2boolean +} diff --git a/vendor/jx_elasticsearch/es52/deep.py b/vendor/jx_elasticsearch/es52/deep.py index b0da1f3..4ce1725 100644 --- a/vendor/jx_elasticsearch/es52/deep.py +++ b/vendor/jx_elasticsearch/es52/deep.py @@ -11,7 +11,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals -from jx_base import STRUCT, NESTED +from jx_base import NESTED from jx_base.expressions import NULL from jx_base.query import DEFAULT_LIMIT from jx_elasticsearch import post as es_post @@ -49,8 +49,7 @@ def is_deepop(es, query): def es_deepop(es, query): schema = query.frum.schema - columns = schema.columns - query_path = schema.query_path + query_path = schema.query_path[0] # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER: ES ALLOWS @@ -68,7 +67,7 @@ def es_deepop(es, query): if not wheres[1]: more_filter = { "bool": { - "must": [AndOp("and", wheres[0]).partial_eval().to_esfilter(schema)], + "filter": [AndOp("and", wheres[0]).partial_eval().to_esfilter(schema)], "must_not": { "nested": { "path": query_path, @@ -103,7 +102,7 @@ def es_deepop(es, query): col_names = set() for c in leaves: if c.nested_path[0] == ".": - if c.type == NESTED: + if c.jx_type == NESTED: continue es_query.stored_fields += [c.es_column] c_name = untype_path(c.names[query_path]) @@ -134,7 +133,7 @@ def es_deepop(es, query): for n in net_columns: pull = get_pull_function(n) if n.nested_path[0] == ".": - if n.type == NESTED: + if n.jx_type == NESTED: continue es_query.stored_fields += [n.es_column] @@ -161,14 +160,14 @@ def es_deepop(es, query): else: expr = s.value for v in expr.vars(): - for c in schema[v]: + for c in schema[v.var]: if c.nested_path[0] == ".": es_query.stored_fields += [c.es_column] # else: # Log.error("deep field not expected") pull_name = EXPRESSION_PREFIX + s.name - map_to_local = {untype_path(k): get_pull(cc) for k, c in schema.lookup.items() for cc in c if cc.type not in STRUCT} + map_to_local = MapToLocal(schema) pull = jx_expression_to_function(pull_name) post_expressions[pull_name] = compile_expression(expr.map(map_to_local).to_python()) @@ -223,3 +222,23 @@ def es_deepop(es, query): Log.error("problem formatting", e) +class MapToLocal(object): + """ + MAP FROM RELATIVE/ABSOLUTE NAMESPACE TO PYTHON THAT WILL EXTRACT RESULT + """ + def __init__(self, map_to_columns): + 
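        # editorial note (illustrative): map_to_columns is typically the
        # Schema, so indexing it with a variable name yields the matching
        # columns. MapToLocal is handed to expr.map(...) in es_deepop() above,
        # replacing each variable in a post-expression with the Python source
        # that pulls its value from an ES hit; assuming get_pull(c) yields
        # code such as row["fields"]["a"], the mapping is roughly:
        #     0 columns -> "Null"
        #     1 column  -> get_pull(cs[0])
        #     n columns -> "coalesce(<pull0>,<pull1>,...)"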
self.map_to_columns = map_to_columns + + def __getitem__(self, item): + return self.get(item) + + def get(self, item): + cs = self.map_to_columns[item] + if len(cs) == 0: + return "Null" + elif len(cs) == 1: + return get_pull(cs[0]) + else: + return "coalesce(" + (",".join(get_pull(c) for c in cs)) + ")" + + diff --git a/vendor/jx_elasticsearch/es52/expressions.py b/vendor/jx_elasticsearch/es52/expressions.py index 09bfe10..dbea756 100644 --- a/vendor/jx_elasticsearch/es52/expressions.py +++ b/vendor/jx_elasticsearch/es52/expressions.py @@ -13,15 +13,15 @@ from __future__ import unicode_literals import itertools -from mo_future import text_type - from jx_base import NUMBER, STRING, BOOLEAN, OBJECT, INTEGER from jx_base.expressions import Variable, TupleOp, LeavesOp, BinaryOp, OrOp, ScriptOp, \ WhenOp, InequalityOp, extend, Literal, NullOp, TrueOp, FalseOp, DivOp, FloorOp, \ EqOp, NeOp, NotOp, LengthOp, NumberOp, StringOp, CountOp, MultiOp, RegExpOp, CoalesceOp, MissingOp, ExistsOp, \ PrefixOp, NotLeftOp, InOp, CaseOp, AndOp, \ - ConcatOp, IsNumberOp, Expression, BasicIndexOfOp, MaxOp, MinOp, BasicEqOp, BooleanOp, IntegerOp, BasicSubstringOp, ZERO, NULL, FirstOp, FALSE, TRUE, SuffixOp, simplified -from mo_dots import coalesce, wrap, Null, unwraplist, set_default, literal_field + ConcatOp, IsNumberOp, Expression, BasicIndexOfOp, MaxOp, MinOp, BasicEqOp, BooleanOp, IntegerOp, BasicSubstringOp, ZERO, NULL, FirstOp, FALSE, TRUE, SuffixOp, simplified, ONE +from jx_elasticsearch.es52.util import es_not, es_script, es_or, es_and, es_missing +from mo_dots import coalesce, wrap, Null, set_default, literal_field +from mo_future import text_type from mo_logs import Log, suppress_exception from mo_logs.strings import expand_template, quote from mo_math import MAX, OR @@ -55,8 +55,8 @@ return output.toString() -class Painless(Expression): - __slots__ = ("miss", "data_type", "expr", "many") +class EsScript(Expression): + __slots__ = ("miss", "data_type", "expr", "many") def __init__(self, type, expr, frum, miss=None, many=False): self.miss = coalesce(miss, FALSE) # Expression that will return true/false to indicate missing result @@ -77,16 +77,16 @@ class Painless(Expression): """ missing = self.miss.partial_eval() if missing is FALSE: - return self.partial_eval().to_painless(schema).expr + return self.partial_eval().to_es_script(schema).expr elif missing is TRUE: return "null" - return "(" + missing.to_painless(schema).expr + ")?null:(" + self.expr + ")" + return "(" + missing.to_es_script(schema).expr + ")?null:(" + self.expr + ")" def to_esfilter(self, schema): - return {"script": {"script": {"lang": "painless", "inline": self.script(schema)}}} + return {"script": es_script(self.script(schema))} - def to_painless(self, schema): + def to_es_script(self, schema): return self def missing(self): @@ -96,7 +96,7 @@ class Painless(Expression): return {"script": self.script} def __eq__(self, other): - if not isinstance(other, Painless): + if not isinstance(other, EsScript): return False elif self.expr==other.expr: return True @@ -105,9 +105,9 @@ class Painless(Expression): @extend(BinaryOp) -def to_painless(self, schema): - lhs = NumberOp("number", 
self.lhs).partial_eval().to_es_script(schema).expr + rhs = NumberOp("number", self.rhs).partial_eval().to_es_script(schema).expr script = "(" + lhs + ") " + BinaryOp.operators[self.op] + " (" + rhs + ")" missing = OrOp("or", [self.lhs.missing(), self.rhs.missing()]) @@ -117,20 +125,20 @@ def to_painless(self, schema): **{ "then": self.default, "else": - Painless(type=NUMBER, expr=script, frum=self) + EsScript(type=NUMBER, expr=script, frum=self) } - ).partial_eval().to_painless(schema) + ).partial_eval().to_es_script(schema) @extend(BinaryOp) def to_esfilter(self, schema): if not isinstance(self.lhs, Variable) or not isinstance(self.rhs, Literal) or self.op in BinaryOp.operators: - return self.to_painless(schema).to_esfilter(schema) + return self.to_es_script(schema).to_esfilter(schema) if self.op in ["eq", "term"]: return {"term": {self.lhs.var: self.rhs.to_esfilter(schema)}} elif self.op in ["ne", "neq"]: - return {"bool": {"must_not": {"term": {self.lhs.var: self.rhs.to_esfilter(schema)}}}} + return es_not({"term": {self.lhs.var: self.rhs.to_esfilter(schema)}}) elif self.op in BinaryOp.ineq_ops: return {"range": {self.lhs.var: {self.op: self.rhs.value}}} else: @@ -138,14 +146,14 @@ def to_esfilter(self, schema): @extend(CaseOp) -def to_painless(self, schema): - acc = self.whens[-1].partial_eval().to_painless(schema) +def to_es_script(self, schema): + acc = self.whens[-1].partial_eval().to_es_script(schema) for w in reversed(self.whens[0:-1]): acc = WhenOp( "when", w.when, **{"then": w.then, "else": acc} - ).partial_eval().to_painless(schema) + ).partial_eval().to_es_script(schema) return acc @@ -162,7 +170,7 @@ def to_esfilter(self, schema): ).partial_eval().to_esfilter(schema) else: Log.error("do not know how to handle") - return ScriptOp("script", self.to_ruby(schema).script(schema)).to_esfilter(schema) + return ScriptOp("script", self.to_es_script(schema).script(schema)).to_esfilter(schema) @extend(ConcatOp) @@ -170,89 +178,89 @@ def to_esfilter(self, schema): if isinstance(self.value, Variable) and isinstance(self.find, Literal): return {"regexp": {self.value.var: ".*" + string2regexp(self.find.value) + ".*"}} else: - return ScriptOp("script", self.to_painless(schema).script(schema)).to_esfilter(schema) + return ScriptOp("script", self.to_es_script(schema).script(schema)).to_esfilter(schema) @extend(ConcatOp) -def to_painless(self, schema): +def to_es_script(self, schema): if len(self.terms) == 0: - return self.default.to_painless(schema) + return self.default.to_es_script(schema) acc = [] separator = StringOp("string", self.separator).partial_eval() - sep = separator.to_painless(schema).expr + sep = separator.to_es_script(schema).expr for t in self.terms: val = WhenOp( "when", t.missing(), **{ "then": Literal("literal", ""), - "else": Painless(type=STRING, expr=sep + "+" + StringOp(None, t).partial_eval().to_painless(schema).expr, frum=t) + "else": EsScript(type=STRING, expr=sep + "+" + StringOp(None, t).partial_eval().to_es_script(schema).expr, frum=t) # "else": ConcatOp("concat", [sep, t]) } ) - acc.append("(" + val.partial_eval().to_painless(schema).expr + ")") - expr_ = "(" + "+".join(acc) + ").substring(" + LengthOp("length", separator).to_painless(schema).expr + ")" + acc.append("(" + val.partial_eval().to_es_script(schema).expr + ")") + expr_ = "(" + "+".join(acc) + ").substring(" + LengthOp("length", separator).to_es_script(schema).expr + ")" if isinstance(self.default, NullOp): - return Painless( + return EsScript( miss=self.missing(), type=STRING, expr=expr_, frum=self ) 
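    # editorial note (illustrative): the concat strategy above prepends the
    # separator to every non-missing term (missing terms contribute "") and
    # then strips one leading separator via substring(len(separator)). For
    # terms ["a", "b"] and separator "-", the generated script is roughly
    #     (("-"+"a")+("-"+"b")).substring(1)  ->  "a-b"
    # and a missing middle term drops out without doubling the separator.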
else: - return Painless( + return EsScript( miss=self.missing(), type=STRING, - expr="((" + expr_ + ").length==0) ? (" + self.default.to_painless(schema).expr + ") : (" + expr_ + ")", + expr="((" + expr_ + ").length==0) ? (" + self.default.to_es_script(schema).expr + ") : (" + expr_ + ")", frum=self ) @extend(Literal) -def to_painless(self, schema): +def to_es_script(self, schema): def _convert(v): if v is None: - return NULL.to_painless(schema) + return NULL.to_es_script(schema) if v is True: - return Painless( + return EsScript( type=BOOLEAN, expr="true", frum=self ) if v is False: - return Painless( + return EsScript( type=BOOLEAN, expr="false", frum=self ) if isinstance(v, text_type): - return Painless( + return EsScript( type=STRING, expr=quote(v), frum=self ) if isinstance(v, int): - return Painless( + return EsScript( type=INTEGER, expr=text_type(v), frum=self ) if isinstance(v, float): - return Painless( + return EsScript( type=NUMBER, expr=text_type(v), frum=self ) if isinstance(v, dict): - return Painless( + return EsScript( type=OBJECT, expr="[" + ", ".join(quote(k) + ": " + _convert(vv) for k, vv in v.items()) + "]", frum=self ) if isinstance(v, (list, tuple)): - return Painless( + return EsScript( type=OBJECT, expr="[" + ", ".join(_convert(vv).expr for vv in v) + "]", frum=self @@ -262,16 +270,16 @@ def to_painless(self, schema): @extend(CoalesceOp) -def to_painless(self, schema): +def to_es_script(self, schema): if not self.terms: - return NULL.to_painless(schema) + return NULL.to_es_script(schema) v = self.terms[-1] - acc = FirstOp("first", v).partial_eval().to_painless(schema) + acc = FirstOp("first", v).partial_eval().to_es_script(schema) for v in reversed(self.terms[:-1]): m = v.missing().partial_eval() - e = NotOp("not", m).partial_eval().to_painless(schema) - r = FirstOp("first", v).partial_eval().to_painless(schema) + e = NotOp("not", m).partial_eval().to_es_script(schema) + r = FirstOp("first", v).partial_eval().to_es_script(schema) if r.miss is TRUE: continue @@ -287,7 +295,7 @@ def to_painless(self, schema): else: new_type = OBJECT - acc = Painless( + acc = EsScript( miss=AndOp("and", [acc.miss, m]).partial_eval(), type=new_type, expr="(" + e.expr + ") ? 
(" + r.expr + ") : (" + acc.expr + ")", @@ -302,8 +310,8 @@ def to_esfilter(self, schema): @extend(ExistsOp) -def to_painless(self, schema): - return self.field.exists().partial_eval().to_painless(schema) +def to_es_script(self, schema): + return self.field.exists().partial_eval().to_es_script(schema) @extend(ExistsOp) @@ -317,8 +325,8 @@ def to_esfilter(self, schema): @extend(NullOp) -def to_painless(self, schema): - return Painless( +def to_es_script(self, schema): + return EsScript( miss=TRUE, type=OBJECT, expr="null", @@ -327,17 +335,17 @@ def to_painless(self, schema): @extend(NullOp) def to_esfilter(self, schema): - return {"bool": {"must_not": {"match_all": {}}}} + return es_not({"match_all": {}}) @extend(FalseOp) -def to_painless(self, schema): - return Painless(type=BOOLEAN, expr="false", frum=self) +def to_es_script(self, schema): + return EsScript(type=BOOLEAN, expr="false", frum=self) @extend(FalseOp) def to_esfilter(self, schema): - return {"bool": {"must_not": {"match_all": {}}}} + return MATCH_NONE @extend(TupleOp) @@ -346,10 +354,10 @@ def to_esfilter(self, schema): @extend(TupleOp) -def to_painless(self, schema): - terms = [FirstOp("first", t).partial_eval().to_painless(schema) for t in self.terms] +def to_es_script(self, schema): + terms = [FirstOp("first", t).partial_eval().to_es_script(schema) for t in self.terms] expr = 'new Object[]{'+','.join(t.expr for t in terms)+'}' - return Painless( + return EsScript( type=OBJECT, expr=expr, miss=FALSE, @@ -359,7 +367,7 @@ def to_painless(self, schema): @extend(LeavesOp) -def to_painless(self, schema): +def to_es_script(self, schema): Log.error("not supported") @@ -369,9 +377,9 @@ def to_esfilter(self, schema): @extend(InequalityOp) -def to_painless(self, schema): - lhs = NumberOp("number", self.lhs).partial_eval().to_painless(schema).expr - rhs = NumberOp("number", self.rhs).partial_eval().to_painless(schema).expr +def to_es_script(self, schema): + lhs = NumberOp("number", self.lhs).partial_eval().to_es_script(schema).expr + rhs = NumberOp("number", self.rhs).partial_eval().to_es_script(schema).expr script = "(" + lhs + ") " + InequalityOp.operators[self.op] + " (" + rhs + ")" output = WhenOp( @@ -380,9 +388,9 @@ def to_painless(self, schema): **{ "then": FALSE, "else": - Painless(type=BOOLEAN, expr=script, frum=self) + EsScript(type=BOOLEAN, expr=script, frum=self) } - ).partial_eval().to_painless(schema) + ).partial_eval().to_es_script(schema) return output @@ -398,23 +406,26 @@ def to_esfilter(self, schema): Log.error("operator {{op|quote}} does not work on objects", op=self.op) return {"range": {lhs: {self.op: self.rhs.value}}} else: - return {"script": {"script": {"lang": "painless", "inline": self.to_painless(schema).script(schema)}}} + script = self.to_es_script(schema) + if script.miss is not FALSE: + Log.error("inequality must be decisive") + return {"script": es_script(script.expr)} @extend(DivOp) -def to_painless(self, schema): +def to_es_script(self, schema): lhs = NumberOp("number", self.lhs).partial_eval() rhs = NumberOp("number", self.rhs).partial_eval() - script = "(" + lhs.to_painless(schema).expr + ") / (" + rhs.to_painless(schema).expr + ")" + script = "(" + lhs.to_es_script(schema).expr + ") / (" + rhs.to_es_script(schema).expr + ")" output = WhenOp( "when", OrOp("or", [self.lhs.missing(), self.rhs.missing(), EqOp("eq", [self.rhs, ZERO])]), **{ "then": self.default, - "else": Painless(type=NUMBER, expr=script, frum=self) + "else": EsScript(type=NUMBER, expr=script, frum=self) } - 
).partial_eval().to_painless(schema) + ).partial_eval().to_es_script(schema) return output @@ -425,20 +436,29 @@ def to_esfilter(self, schema): @extend(FloorOp) -def to_painless(self, schema): - lhs = self.lhs.to_painless(schema) - rhs = self.rhs.to_painless(schema) - script = "(int)Math.floor(((double)(" + lhs + ") / (double)(" + rhs + ")).doubleValue())*(" + rhs + ")" +def to_es_script(self, schema): + lhs = self.lhs.partial_eval().to_es_script(schema) + rhs = self.rhs.partial_eval().to_es_script(schema) + + if rhs.frum is ONE: + script = "(int)Math.floor(" + lhs.expr + ")" + else: + script = "Math.floor((" + lhs.expr + ") / (" + rhs.expr + "))*(" + rhs.expr + ")" output = WhenOp( "when", - OrOp("or", [self.lhs.missing(), self.rhs.missing(), EqOp("eq", [self.rhs, ZERO])]), + OrOp("or", [lhs.miss, rhs.miss, EqOp("eq", [self.rhs, ZERO])]), **{ "then": self.default, "else": - ScriptOp("script", script) + EsScript( + type=NUMBER, + expr=script, + frum=self, + miss=FALSE + ) } - ).to_painless(schema) + ).to_es_script(schema) return output @@ -447,7 +467,6 @@ def to_esfilter(self, schema): Log.error("Logic error") - @simplified @extend(EqOp) def partial_eval(self): @@ -457,12 +476,12 @@ def partial_eval(self): @extend(EqOp) -def to_painless(self, schema): +def to_es_script(self, schema): return CaseOp("case", [ WhenOp("when", self.lhs.missing(), **{"then": self.rhs.missing()}), WhenOp("when", self.rhs.missing(), **{"then": FALSE}), BasicEqOp("eq", [self.lhs, self.rhs]) - ]).partial_eval().to_painless(schema) + ]).partial_eval().to_es_script(schema) @extend(EqOp) @@ -491,26 +510,26 @@ def to_esfilter(self, schema): @extend(BasicEqOp) -def to_painless(self, schema): - lhs = self.lhs.partial_eval().to_painless(schema) - rhs = self.rhs.partial_eval().to_painless(schema) +def to_es_script(self, schema): + lhs = self.lhs.partial_eval().to_es_script(schema) + rhs = self.rhs.partial_eval().to_es_script(schema) if lhs.many: if rhs.many: return AndOp("and", [ - Painless(type=BOOLEAN, expr="(" + lhs.expr + ").size()==(" + rhs.expr + ").size()", frum=self), - Painless(type=BOOLEAN, expr="(" + rhs.expr + ").containsAll(" + lhs.expr + ")", frum=self) - ]).to_painless(schema) + EsScript(type=BOOLEAN, expr="(" + lhs.expr + ").size()==(" + rhs.expr + ").size()", frum=self), + EsScript(type=BOOLEAN, expr="(" + rhs.expr + ").containsAll(" + lhs.expr + ")", frum=self) + ]).to_es_script(schema) else: - return Painless(type=BOOLEAN, expr="(" + lhs.expr + ").contains(" + rhs.expr + ")",frum=self) + return EsScript(type=BOOLEAN, expr="(" + lhs.expr + ").contains(" + rhs.expr + ")",frum=self) elif rhs.many: - return Painless( + return EsScript( type=BOOLEAN, expr="(" + rhs.expr + ").contains(" + lhs.expr + ")", frum=self ) else: - return Painless( + return EsScript( type=BOOLEAN, expr="(" + lhs.expr + "==" + rhs.expr + ")", frum=self @@ -533,32 +552,32 @@ def to_esfilter(self, schema): else: return {"term": {lhs: rhs}} else: - return self.to_painless(schema).to_esfilter(schema) + return self.to_es_script(schema).to_esfilter(schema) @extend(MissingOp) -def to_painless(self, schema, not_null=False, boolean=True): +def to_es_script(self, schema, not_null=False, boolean=True): if isinstance(self.expr, Variable): if self.expr.var == "_id": - return Painless(type=BOOLEAN, expr="false", frum=self) + return EsScript(type=BOOLEAN, expr="false", frum=self) else: columns = schema.leaves(self.expr.var) if len(columns) == 1: - return Painless(type=BOOLEAN, expr="doc[" + quote(columns[0].es_column) + "].empty", frum=self) + return 
EsScript(type=BOOLEAN, expr="doc[" + quote(columns[0].es_column) + "].empty", frum=self) else: return AndOp("and", [ - Painless( + EsScript( type=BOOLEAN, expr="doc[" + quote(c.es_column) + "].empty", frum=self ) for c in columns - ]).partial_eval().to_painless(schema) + ]).partial_eval().to_es_script(schema) elif isinstance(self.expr, Literal): - return self.expr.missing().to_painless(schema) + return self.expr.missing().to_es_script(schema) else: - return self.expr.missing().to_painless(schema) + return self.expr.missing().partial_eval().to_es_script(schema) @extend(MissingOp) @@ -568,22 +587,22 @@ def to_esfilter(self, schema): if not cols: return {"match_all": {}} elif len(cols) == 1: - return {"bool": {"must_not": {"exists": {"field": cols[0].es_column}}}} + return es_missing(cols[0].es_column) else: - return {"bool": {"must": [ - {"bool": {"must_not": {"exists": {"field": c.es_column}}}} for c in cols] - }} + return es_and([ + es_missing(c.es_column) for c in cols + ]) else: - return ScriptOp("script", self.to_painless(schema).script(schema)).to_esfilter(schema) + return ScriptOp("script", self.to_es_script(schema).script(schema)).to_esfilter(schema) @extend(NotLeftOp) -def to_painless(self, schema): - v = StringOp("string", self.value).partial_eval().to_painless(schema).expr - l = NumberOp("number", self.length).partial_eval().to_painless(schema).expr +def to_es_script(self, schema): + v = StringOp("string", self.value).partial_eval().to_es_script(schema).expr + l = NumberOp("number", self.length).partial_eval().to_es_script(schema).expr expr = "(" + v + ").substring((int)Math.max(0, (int)Math.min(" + v + ".length(), " + l + ")))" - return Painless( + return EsScript( miss=OrOp("or", [self.value.missing(), self.length.missing()]), type=STRING, expr=expr, @@ -592,12 +611,12 @@ def to_painless(self, schema): @extend(NeOp) -def to_painless(self, schema): +def to_es_script(self, schema): return CaseOp("case", [ WhenOp("when", self.lhs.missing(), **{"then": NotOp("not", self.rhs.missing())}), WhenOp("when", self.rhs.missing(), **{"then": NotOp("not", self.lhs.missing())}), NotOp("not", BasicEqOp("eq", [self.lhs, self.rhs])) - ]).partial_eval().to_painless(schema) + ]).partial_eval().to_es_script(schema) @extend(NeOp) @@ -607,16 +626,16 @@ def to_esfilter(self, schema): if len(columns) == 0: return {"match_all": {}} elif len(columns) == 1: - return {"bool": {"must_not": {"term": {columns[0].es_column: self.rhs.value}}}} + return es_not({"term": {columns[0].es_column: self.rhs.value}}) else: Log.error("column split to multiple, not handled") else: - lhs = self.lhs.partial_eval().to_painless(schema) - rhs = self.rhs.partial_eval().to_painless(schema) + lhs = self.lhs.partial_eval().to_es_script(schema) + rhs = self.rhs.partial_eval().to_es_script(schema) if lhs.many: if rhs.many: - return wrap({"bool": {"must_not": + return es_not( ScriptOp( "script", ( @@ -624,26 +643,26 @@ def to_esfilter(self, schema): "(" + rhs.expr + ").containsAll(" + lhs.expr + ")" ) ).to_esfilter(schema) - }}) + ) else: - return wrap({"bool": {"must_not": + return es_not( ScriptOp("script", "(" + lhs.expr + ").contains(" + rhs.expr + ")").to_esfilter(schema) - }}) + ) else: if rhs.many: - return wrap({"bool": {"must_not": + return es_not( ScriptOp("script", "(" + rhs.expr + ").contains(" + lhs.expr + ")").to_esfilter(schema) - }}) + ) else: - return wrap({"bool": {"must": + return es_not( ScriptOp("script", "(" + lhs.expr + ") != (" + rhs.expr + ")").to_esfilter(schema) - }}) + ) @extend(NotOp) -def 
to_painless(self, schema): - return Painless( +def to_es_script(self, schema): + return EsScript( type=BOOLEAN, - expr="!(" + self.term.to_painless(schema).expr + ")", + expr="!(" + self.term.to_es_script(schema).expr + ")", frum=self ) @@ -658,18 +677,18 @@ def to_esfilter(self, schema): return {"exists": {"field": v}} else: operand = self.term.to_esfilter(schema) - return {"bool": {"must_not": operand}} + return es_not(operand) @extend(AndOp) -def to_painless(self, schema): +def to_es_script(self, schema): if not self.terms: - return TRUE.to_painless() + return TRUE.to_es_script() else: - return Painless( + return EsScript( miss=FALSE, type=BOOLEAN, - expr=" && ".join("(" + t.to_painless(schema).expr + ")" for t in self.terms), + expr=" && ".join("(" + t.to_es_script(schema).expr + ")" for t in self.terms), frum=self ) @@ -679,29 +698,40 @@ def to_esfilter(self, schema): if not len(self.terms): return {"match_all": {}} else: - return {"bool": {"must": [t.to_esfilter(schema) for t in self.terms]}} + return es_and([t.to_esfilter(schema) for t in self.terms]) @extend(OrOp) -def to_painless(self, schema): - return Painless( +def to_es_script(self, schema): + return EsScript( miss=FALSE, type=BOOLEAN, - expr=" || ".join("(" + t.to_painless(schema).expr + ")" for t in self.terms if t), + expr=" || ".join("(" + t.to_es_script(schema).expr + ")" for t in self.terms if t), frum=self ) @extend(OrOp) def to_esfilter(self, schema): - return {"bool": {"should": [t.to_esfilter(schema) for t in self.terms]}} + return es_or([t.partial_eval().to_esfilter(schema) for t in self.terms]) + + # OR(x) == NOT(AND(NOT(xi) for xi in x)) + # output = es_not(es_and([ + # NotOp("not", t).partial_eval().to_esfilter(schema) + # for t in self.terms + # ])) + # return output + + # WE REQUIRE EXIT-EARLY SEMANTICS, OTHERWISE EVERY EXPRESSION IS A SCRIPT EXPRESSION + # {"bool":{"should" :[a, b, c]}} RUNS IN PARALLEL + # {"bool":{"must_not":[a, b, c]}} ALSO RUNS IN PARALLEL @extend(LengthOp) -def to_painless(self, schema): - value = StringOp("string", self.term).to_painless(schema) +def to_es_script(self, schema): + value = StringOp("string", self.term).to_es_script(schema) missing = self.term.missing().partial_eval() - return Painless( + return EsScript( miss=missing, type=INTEGER, expr="(" + value.expr + ").length()", @@ -710,70 +740,65 @@ def to_painless(self, schema): @extend(FirstOp) -def to_painless(self, schema): +def to_es_script(self, schema): if isinstance(self.term, Variable): columns = schema.values(self.term.var) if len(columns) == 1: - return Painless( - miss=MissingOp("missing", self.term), - type=self.term.type, - expr="doc[" + quote(columns[0].es_column) + "].value", - frum=self - ) + return self.term.to_es_script(schema, many=False) - term = self.term.to_painless(schema) + term = self.term.to_es_script(schema) if isinstance(term.frum, CoalesceOp): - return CoalesceOp("coalesce", [FirstOp("first", t.partial_eval().to_painless(schema)) for t in term.frum.terms]).to_painless(schema) + return CoalesceOp("coalesce", [FirstOp("first", t.partial_eval().to_es_script(schema)) for t in term.frum.terms]).to_es_script(schema) if term.many: - return Painless( + return EsScript( miss=term.miss, type=term.type, expr="(" + term.expr + ")[0]", frum=term.frum - ).to_painless(schema) + ).to_es_script(schema) else: return term @extend(BooleanOp) -def to_painless(self, schema): - value = self.term.to_painless(schema) +def to_es_script(self, schema): + value = self.term.to_es_script(schema) if value.many: - return 
BooleanOp("boolean", Painless( + return BooleanOp("boolean", EsScript( miss=value.miss, type=value.type, expr="(" + value.expr + ")[0]", frum=value.frum - )).to_painless(schema) + )).to_es_script(schema) elif value.type == BOOLEAN: miss = value.miss value.miss = FALSE - return WhenOp("when", miss, **{"then": FALSE, "else": value}).partial_eval().to_painless(schema) + return WhenOp("when", miss, **{"then": FALSE, "else": value}).partial_eval().to_es_script(schema) else: - return NotOp("not", value.miss).partial_eval().to_painless(schema) + return NotOp("not", value.miss).partial_eval().to_es_script(schema) @extend(BooleanOp) def to_esfilter(self, schema): if isinstance(self.term, Variable): return {"term": {self.term.var: True}} else: - return self.to_painless(schema).to_esfilter(schema) + return self.to_es_script(schema).to_esfilter(schema) @extend(IntegerOp) -def to_painless(self, schema): - value = self.term.to_painless(schema) +def to_es_script(self, schema): + value = self.term.to_es_script(schema) if value.many: - return IntegerOp("integer", Painless( + return IntegerOp("integer", EsScript( miss=value.missing, type=value.type, expr="(" + value.expr + ")[0]", frum=value.frum - )).to_painless(schema) + )).to_es_script(schema) elif value.type == BOOLEAN: - return Painless( + return EsScript( miss=value.missing, type=INTEGER, expr=value.expr + " ? 1 : 0", @@ -782,21 +807,21 @@ def to_painless(self, schema): elif value.type == INTEGER: return value elif value.type == NUMBER: - return Painless( + return EsScript( miss=value.missing, type=INTEGER, expr="(int)(" + value.expr + ")", frum=self ) elif value.type == STRING: - return Painless( + return EsScript( miss=value.missing, type=INTEGER, expr="Integer.parseInt(" + value.expr + ")", frum=self ) else: - return Painless( + return EsScript( miss=value.missing, type=INTEGER, expr="((" + value.expr + ") instanceof String) ? Integer.parseInt(" + value.expr + ") : (int)(" + value.expr + ")", @@ -804,43 +829,43 @@ def to_painless(self, schema): ) @extend(NumberOp) -def to_painless(self, schema): +def to_es_script(self, schema): term = FirstOp("first", self.term).partial_eval() - value = term.to_painless(schema) + value = term.to_es_script(schema) if isinstance(value.frum, CoalesceOp): - return CoalesceOp("coalesce", [NumberOp("number", t).partial_eval().to_painless(schema) for t in value.frum.terms]).to_painless(schema) + return CoalesceOp("coalesce", [NumberOp("number", t).partial_eval().to_es_script(schema) for t in value.frum.terms]).to_es_script(schema) if value.type == BOOLEAN: - return Painless( + return EsScript( miss=term.missing().partial_eval(), type=NUMBER, expr=value.expr + " ? 1 : 0", frum=self ) elif value.type == INTEGER: - return Painless( + return EsScript( miss=term.missing().partial_eval(), type=NUMBER, expr=value.expr, frum=self ) elif value.type == NUMBER: - return Painless( + return EsScript( miss=term.missing().partial_eval(), type=NUMBER, expr=value.expr, frum=self ) elif value.type == STRING: - return Painless( + return EsScript( miss=term.missing().partial_eval(), type=NUMBER, expr="Double.parseDouble(" + value.expr + ")", frum=self ) elif value.type == OBJECT: - return Painless( + return EsScript( miss=term.missing().partial_eval(), type=NUMBER, expr="((" + value.expr + ") instanceof String) ? 
Double.parseDouble(" + value.expr + ") : (" + value.expr + ")", @@ -849,12 +874,12 @@ def to_painless(self, schema): @extend(IsNumberOp) -def to_painless(self, schema): - value = self.term.to_painless(schema) +def to_es_script(self, schema): + value = self.term.to_es_script(schema) if value.expr or value.i: - return TRUE.to_painless(schema) + return TRUE.to_es_script(schema) else: - return Painless( + return EsScript( miss=FALSE, type=BOOLEAN, expr="(" + value.expr + ") instanceof java.lang.Double", @@ -862,11 +887,11 @@ def to_painless(self, schema): ) @extend(CountOp) -def to_painless(self, schema): - return Painless( +def to_es_script(self, schema): + return EsScript( miss=FALSE, type=INTEGER, - expr="+".join("((" + t.missing().partial_eval().to_painless(schema).expr + ") ? 0 : 1)" for t in self.terms), + expr="+".join("((" + t.missing().partial_eval().to_es_script(schema).expr + ") ? 0 : 1)" for t in self.terms), frum=self ) @@ -877,11 +902,11 @@ def to_esfilter(self, schema): @extend(MaxOp) -def to_painless(self, schema): - acc = NumberOp("number", self.terms[-1]).partial_eval().to_painless(schema).expr +def to_es_script(self, schema): + acc = NumberOp("number", self.terms[-1]).partial_eval().to_es_script(schema).expr for t in reversed(self.terms[0:-1]): - acc = "Math.max(" + NumberOp("number", t).partial_eval().to_painless(schema).expr + " , " + acc + ")" - return Painless( + acc = "Math.max(" + NumberOp("number", t).partial_eval().to_es_script(schema).expr + " , " + acc + ")" + return EsScript( miss=AndOp("or", [t.missing() for t in self.terms]), type=NUMBER, expr=acc, @@ -890,11 +915,11 @@ def to_painless(self, schema): @extend(MinOp) -def to_painless(self, schema): - acc = NumberOp("number", self.terms[-1]).partial_eval().to_painless(schema).expr +def to_es_script(self, schema): + acc = NumberOp("number", self.terms[-1]).partial_eval().to_es_script(schema).expr for t in reversed(self.terms[0:-1]): - acc = "Math.min(" + NumberOp("number", t).partial_eval().to_painless(schema).expr + " , " + acc + ")" - return Painless( + acc = "Math.min(" + NumberOp("number", t).partial_eval().to_es_script(schema).expr + " , " + acc + ")" + return EsScript( miss=AndOp("or", [t.missing() for t in self.terms]), type=NUMBER, expr=acc, @@ -912,28 +937,28 @@ _painless_operators = { @extend(MultiOp) -def to_painless(self, schema): +def to_es_script(self, schema): op, unit = _painless_operators[self.op] if self.nulls: calc = op.join( - "((" + t.missing().to_painless(schema).expr + ") ? " + unit + " : (" + NumberOp("number", t).partial_eval().to_painless(schema).expr + "))" + "((" + t.missing().to_es_script(schema).expr + ") ? 
" + unit + " : (" + NumberOp("number", t).partial_eval().to_es_script(schema).expr + "))" for t in self.terms ) return WhenOp( "when", AndOp("and", [t.missing() for t in self.terms]), - **{"then": self.default, "else": Painless(type=NUMBER, expr=calc, frum=self)} - ).partial_eval().to_painless(schema) + **{"then": self.default, "else": EsScript(type=NUMBER, expr=calc, frum=self)} + ).partial_eval().to_es_script(schema) else: calc = op.join( - "(" + NumberOp("number", t).to_painless(schema).expr + ")" + "(" + NumberOp("number", t).to_es_script(schema).expr + ")" for t in self.terms ) return WhenOp( "when", OrOp("or", [t.missing() for t in self.terms]), - **{"then": self.default, "else": Painless(type=NUMBER, expr=calc, frum=self)} - ).partial_eval().to_painless(schema) + **{"then": self.default, "else": EsScript(type=NUMBER, expr=calc, frum=self)} + ).partial_eval().to_es_script(schema) @extend(RegExpOp) @@ -941,7 +966,7 @@ def to_esfilter(self, schema): if isinstance(self.pattern, Literal) and isinstance(self.var, Variable): cols = schema.leaves(self.var.var) if len(cols) == 0: - return {"bool": {"must_not": {"match_all": {}}}} + return MATCH_NONE elif len(cols) == 1: return {"regexp": {cols[0].es_column: self.pattern.value}} else: @@ -951,29 +976,29 @@ def to_esfilter(self, schema): @extend(StringOp) -def to_painless(self, schema): +def to_es_script(self, schema): term = FirstOp("first", self.term).partial_eval() - value = term.to_painless(schema) + value = term.to_es_script(schema) if isinstance(value.frum, CoalesceOp): - return CoalesceOp("coalesce", [StringOp("string", t).partial_eval() for t in value.frum.terms]).to_painless(schema) + return CoalesceOp("coalesce", [StringOp("string", t).partial_eval() for t in value.frum.terms]).to_es_script(schema) if value.type == BOOLEAN: - return Painless( + return EsScript( miss=self.term.missing().partial_eval(), type=STRING, expr=value.expr + ' ? 
"T" : "F"', frum=self ) elif value.type == INTEGER: - return Painless( + return EsScript( miss=self.term.missing().partial_eval(), type=STRING, expr="String.valueOf(" + value.expr + ")", frum=self ) elif value.type == NUMBER: - return Painless( + return EsScript( miss=self.term.missing().partial_eval(), type=STRING, expr=expand_template(NUMBER_TO_STRING, {"expr":value.expr}), @@ -982,7 +1007,7 @@ def to_painless(self, schema): elif value.type == STRING: return value else: - return Painless( + return EsScript( miss=self.term.missing().partial_eval(), type=STRING, expr=expand_template(NUMBER_TO_STRING, {"expr":value.expr}), @@ -994,8 +1019,8 @@ def to_painless(self, schema): @extend(TrueOp) -def to_painless(self, schema): - return Painless(type=BOOLEAN, expr="true", frum=self) +def to_es_script(self, schema): + return EsScript(type=BOOLEAN, expr="true", frum=self) @extend(TrueOp) @@ -1004,11 +1029,11 @@ def to_esfilter(self, schema): @extend(PrefixOp) -def to_painless(self, schema): +def to_es_script(self, schema): if not self.field: return "true" else: - return "(" + self.field.to_painless(schema) + ").startsWith(" + self.prefix.to_painless(schema) + ")" + return "(" + self.field.to_es_script(schema) + ").startsWith(" + self.prefix.to_es_script(schema) + ")" @extend(PrefixOp) @@ -1019,14 +1044,14 @@ def to_esfilter(self, schema): var = schema.leaves(self.expr.var)[0].es_column return {"prefix": {var: self.prefix.value}} else: - return ScriptOp("script", self.to_painless(schema).script(schema)).to_esfilter(schema) + return ScriptOp("script", self.to_es_script(schema).script(schema)).to_esfilter(schema) @extend(SuffixOp) -def to_painless(self, schema): +def to_es_script(self, schema): if not self.suffix: return "true" else: - return "(" + self.expr.to_painless(schema) + ").endsWith(" + self.suffix.to_painless(schema) + ")" + return "(" + self.expr.to_es_script(schema) + ").endsWith(" + self.suffix.to_es_script(schema) + ")" @extend(SuffixOp) @@ -1037,14 +1062,14 @@ def to_esfilter(self, schema): var = schema.leaves(self.expr.var)[0].es_column return {"regexp": {var: ".*"+string2regexp(self.suffix.value)}} else: - return ScriptOp("script", self.to_painless(schema).script(schema)).to_esfilter(schema) + return ScriptOp("script", self.to_es_script(schema).script(schema)).to_esfilter(schema) @extend(InOp) -def to_painless(self, schema): - superset = self.superset.to_painless(schema) - value = self.value.to_painless(schema) - return Painless( +def to_es_script(self, schema): + superset = self.superset.to_es_script(schema) + value = self.value.to_es_script(schema) + return EsScript( type=BOOLEAN, expr="(" + superset.expr + ").contains(" + value.expr + ")", frum=self @@ -1060,26 +1085,26 @@ def to_esfilter(self, schema): var = cols[0].es_column return {"terms": {var: self.superset.value}} else: - return ScriptOp("script", self.to_painless(schema).script(schema)).to_esfilter(schema) + return ScriptOp("script", self.to_es_script(schema).script(schema)).to_esfilter(schema) @extend(ScriptOp) -def to_painless(self, schema): - return Painless(type=OBJECT, expr=self.script) +def to_es_script(self, schema): + return EsScript(type=self.data_type, expr=self.script, frum=self) @extend(ScriptOp) def to_esfilter(self, schema): - return {"script": {"script": {"lang": "painless", "inline": self.script}}} + return {"script": es_script(self.script)} @extend(Variable) -def to_painless(self, schema): +def to_es_script(self, schema, many=True): if self.var == ".": return "_source" else: if self.var == "_id": - return 
Painless(type=STRING, expr='doc["_uid"].value.substring(doc["_uid"].value.indexOf(\'#\')+1)', frum=self) + return EsScript(type=STRING, expr='doc["_uid"].value.substring(doc["_uid"].value.indexOf(\'#\')+1)', frum=self) columns = schema.values(self.var) acc = [] @@ -1087,42 +1112,51 @@ def to_painless(self, schema): varname = c.es_column frum = Variable(c.es_column) q = quote(varname) - acc.append(Painless( - miss=frum.missing(), - type=c.type, - expr="doc[" + q + "].values" if c.type!=BOOLEAN else "doc[" + q + "].value", - frum=frum, - many=True - )) + if many: + acc.append(EsScript( + miss=frum.missing(), + type=c.jx_type, + expr="doc[" + q + "].values" if c.jx_type != BOOLEAN else "doc[" + q + "].value", + frum=frum, + many=True + )) + else: + acc.append(EsScript( + miss=frum.missing(), + type=c.jx_type, + expr="doc[" + q + "].value" if c.jx_type != BOOLEAN else "doc[" + q + "].value", + frum=frum, + many=True + )) if len(acc) == 0: - return NULL.to_painless(schema) + return NULL.to_es_script(schema) elif len(acc) == 1: return acc[0] else: - return CoalesceOp("coalesce", acc).to_painless(schema) + return CoalesceOp("coalesce", acc).to_es_script(schema) @extend(WhenOp) -def to_painless(self, schema): +def to_es_script(self, schema): if self.simplified: - when = self.when.to_painless(schema) - then = self.then.to_painless(schema) - els_ = self.els_.to_painless(schema) + when = self.when.to_es_script(schema) + then = self.then.to_es_script(schema) + els_ = self.els_.to_es_script(schema) if when is TRUE: return then elif when is FALSE: return els_ elif then.miss is TRUE: - return Painless( + return EsScript( miss=self.missing(), type=els_.type, expr=els_.expr, frum=self ) elif els_.miss is TRUE: - return Painless( + return EsScript( miss=self.missing(), type=then.type, expr=then.expr, @@ -1130,14 +1164,14 @@ def to_painless(self, schema): ) elif then.type == els_.type: - return Painless( + return EsScript( miss=self.missing(), type=then.type, expr="(" + when.expr + ") ? (" + then.expr + ") : (" + els_.expr + ")", frum=self ) elif then.type in (INTEGER, NUMBER) and els_.type in (INTEGER, NUMBER): - return Painless( + return EsScript( miss=self.missing(), type=NUMBER, expr="(" + when.expr + ") ? 
(" + then.expr + ") : (" + els_.expr + ")", @@ -1146,7 +1180,7 @@ def to_painless(self, schema): else: Log.error("do not know how to handle") else: - return self.partial_eval().to_painless(schema) + return self.partial_eval().to_es_script(schema) @extend(WhenOp) @@ -1160,12 +1194,12 @@ def to_esfilter(self, schema): @extend(BasicIndexOfOp) -def to_painless(self, schema): - v = StringOp("string", self.value).to_painless(schema).expr - find = StringOp("string", self.find).to_painless(schema).expr - start = IntegerOp("integer", self.start).to_painless(schema).expr +def to_es_script(self, schema): + v = StringOp("string", self.value).to_es_script(schema).expr + find = StringOp("string", self.find).to_es_script(schema).expr + start = IntegerOp("integer", self.start).to_es_script(schema).expr - return Painless( + return EsScript( miss=FALSE, type=INTEGER, expr="(" + v + ").indexOf(" + find + ", " + start + ")", @@ -1175,16 +1209,16 @@ def to_painless(self, schema): @extend(BasicIndexOfOp) def to_esfilter(self, schema): - return ScriptOp("", self.to_painless(schema).script(schema)).to_esfilter(schema) + return ScriptOp("", self.to_es_script(schema).script(schema)).to_esfilter(schema) @extend(BasicSubstringOp) -def to_painless(self, schema): - v = StringOp("string", self.value).partial_eval().to_painless(schema).expr - start = IntegerOp("string", self.start).partial_eval().to_painless(schema).expr - end = IntegerOp("integer", self.end).partial_eval().to_painless(schema).expr +def to_es_script(self, schema): + v = StringOp("string", self.value).partial_eval().to_es_script(schema).expr + start = IntegerOp("string", self.start).partial_eval().to_es_script(schema).expr + end = IntegerOp("integer", self.end).partial_eval().to_es_script(schema).expr - return Painless( + return EsScript( miss=FALSE, type=STRING, expr="(" + v + ").substring(" + start + ", " + end + ")", @@ -1194,7 +1228,7 @@ def to_painless(self, schema): MATCH_ALL = wrap({"match_all": {}}) -MATCH_NONE = wrap({"bool": {"must_not": {"match_all": {}}}}) +MATCH_NONE = es_not({"match_all": {}}) def simplify_esfilter(esfilter): @@ -1222,8 +1256,8 @@ def _normalize(esfilter): while isDiff: isDiff = False - if esfilter.bool.must: - terms = esfilter.bool.must + if esfilter.bool.filter: + terms = esfilter.bool.filter for (i0, t0), (i1, t1) in itertools.product(enumerate(terms), enumerate(terms)): if i0 == i1: continue # SAME, IGNORE @@ -1264,10 +1298,10 @@ def _normalize(esfilter): continue if a == MATCH_NONE: return MATCH_NONE - if a.bool.must: + if a.bool.filter: isDiff = True a.isNormal = None - output.extend(a.bool.must) + output.extend(a.bool.filter) else: a.isNormal = None output.append(a) @@ -1278,7 +1312,7 @@ def _normalize(esfilter): esfilter = output[0] break elif isDiff: - esfilter = wrap({"bool": {"must": output}}) + esfilter = es_and(output) continue if esfilter.bool.should: @@ -1318,18 +1352,14 @@ def _normalize(esfilter): if OR(vv == None for vv in v): rest = [vv for vv in v if vv != None] if len(rest) > 0: - return { - "bool": {"should": [ - {"bool": {"must_not": {"exists": {"field": k}}}}, - {"terms": {k: rest}} - ]}, - "isNormal": True - } + output = es_or([ + es_missing(k), + {"terms": {k: rest}} + ]) else: - return { - "bool": {"must_not": {"exists": {"field": k}}}, - "isNormal": True - } + output = es_missing(k) + output.isNormal = True + return output else: esfilter.isNormal = True return esfilter @@ -1371,7 +1401,7 @@ def split_expression_by_depth(where, schema, output=None, var_to_depth=None): if not vars_: return Null # 
MAP VARIABLE NAMES TO HOW DEEP THEY ARE - var_to_depth = {v: max(len(c.nested_path) - 1, 0) for v in vars_ for c in schema[v]} + var_to_depth = {v.var: max(len(c.nested_path) - 1, 0) for v in vars_ for c in schema[v.var]} all_depths = set(var_to_depth.values()) # if -1 in all_depths: # Log.error( @@ -1382,7 +1412,7 @@ def split_expression_by_depth(where, schema, output=None, var_to_depth=None): all_depths = {0} output = wrap([[] for _ in range(MAX(all_depths) + 1)]) else: - all_depths = set(var_to_depth[v] for v in vars_) + all_depths = set(var_to_depth[v.var] for v in vars_) if len(all_depths) == 1: output[list(all_depths)[0]] += [where] @@ -1399,10 +1429,10 @@ def get_type(var_name): type_ = var_name.split(".$")[1:] if not type_: return "j" - return json_type_to_painless_type.get(type_[0], "j") + return json_type_to_es_script_type.get(type_[0], "j") -json_type_to_painless_type = { +json_type_to_es_script_type = { "string": "s", "boolean": "b", "number": "n" diff --git a/vendor/jx_elasticsearch/es52/format.py b/vendor/jx_elasticsearch/es52/format.py index 02b7415..b63fc88 100644 --- a/vendor/jx_elasticsearch/es52/format.py +++ b/vendor/jx_elasticsearch/es52/format.py @@ -11,18 +11,15 @@ from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals -from collections import Mapping - -from mo_dots import Data, set_default, wrap, split_field, coalesce -from mo_future import sort_using_key -from mo_logs import Log -from pyLibrary import convert - from jx_base.expressions import TupleOp from jx_elasticsearch.es52.aggs import count_dim, aggs_iterator, format_dispatch, drill from jx_python.containers.cube import Cube from mo_collections.matrix import Matrix +from mo_dots import Data, set_default, wrap, split_field, coalesce +from mo_future import sort_using_key +from mo_logs import Log from mo_logs.strings import quote +from pyLibrary import convert FunctionType = type(lambda: 1) @@ -191,6 +188,9 @@ def format_list_from_groupby(decoders, aggs, start, query, select): output[s.name] = s.pull(agg) yield output + for g in query.groupby: + g.put.name = coalesce(g.put.name, g.name) + output = Data( meta={"format": "list"}, data=list(data()) @@ -208,7 +208,7 @@ def format_list(decoders, aggs, start, query, select): if query.sort and not query.groupby: # TODO: USE THE format_table() TO PRODUCE THE NEEDED VALUES INSTEAD OF DUPLICATING LOGIC HERE all_coord = is_sent._all_combos() # TRACK THE EXPECTED COMBINATIONS - for row, coord, agg in aggs_iterator(aggs, decoders): + for _, coord, agg in aggs_iterator(aggs, decoders): missing_coord = all_coord.next() while coord != missing_coord: # INSERT THE MISSING COORDINATE INTO THE GENERATION @@ -230,7 +230,7 @@ def format_list(decoders, aggs, start, query, select): output[s.name] = s.pull(agg) yield output else: - is_sent = Matrix(dims=dims, zeros=0) + for row, coord, agg in aggs_iterator(aggs, decoders): is_sent[coord] = 1 diff --git a/vendor/jx_elasticsearch/es52/setop.py b/vendor/jx_elasticsearch/es52/setop.py index c5960c6..8bf27ce 100644 --- a/vendor/jx_elasticsearch/es52/setop.py +++ b/vendor/jx_elasticsearch/es52/setop.py @@ -19,12 +19,11 @@ from jx_base.expressions import IDENTITY from jx_base.query import DEFAULT_LIMIT from jx_elasticsearch import post as es_post from jx_elasticsearch.es52.expressions import Variable, LeavesOp -from jx_elasticsearch.es52.util import jx_sort_to_es_sort, es_query_template +from jx_elasticsearch.es52.util import jx_sort_to_es_sort, es_query_template, es_and, es_or, es_not, 
es_script from jx_python.containers.cube import Cube from jx_python.expressions import jx_expression_to_function from mo_collections.matrix import Matrix -from mo_dots import coalesce, split_field, set_default, Data, unwraplist, literal_field, unwrap, wrap, concat_field, relative_field, join_field -from mo_dots import listwrap +from mo_dots import coalesce, split_field, set_default, Data, unwraplist, literal_field, unwrap, wrap, concat_field, relative_field, join_field, listwrap from mo_dots.lists import FlatList from mo_json.typed_encoder import untype_path, unnest_path, untyped from mo_logs import Log @@ -56,7 +55,7 @@ def is_setop(es, query): def es_setop(es, query): schema = query.frum.schema - es_query, filters = es_query_template(schema.query_path) + es_query, filters = es_query_template(schema.query_path[0]) nested_filter = None set_default(filters[0], query.where.partial_eval().to_esfilter(schema)) es_query.size = coalesce(query.limit, DEFAULT_LIMIT) @@ -75,10 +74,10 @@ def es_setop(es, query): # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS if isinstance(select.value, LeavesOp) and isinstance(select.value.term, Variable): term = select.value.term - leaves = schema.values(term.var) + leaves = schema.leaves(term.var) for c in leaves: full_name = concat_field(select.name, relative_field(untype_path(c.names["."]), term.var)) - if c.type == NESTED: + if c.jx_type == NESTED: es_query.stored_fields = ["_source"] new_select.append({ "name": full_name, @@ -88,7 +87,7 @@ def es_setop(es, query): }) put_index += 1 elif c.nested_path[0] != ".": - es_query.stored_fields = ["_source"] + pass # THE NESTED PARENT WILL CAPTURE THIS else: es_query.stored_fields += [c.es_column] new_select.append({ @@ -103,7 +102,7 @@ def es_setop(es, query): leaves = schema.leaves(s_column) nested_selects = {} if leaves: - if any(c.type == NESTED for c in leaves): + if s_column == '.' 
or any(c.jx_type == NESTED for c in leaves): # PULL WHOLE NESTED ARRAYS es_query.stored_fields = ["_source"] for c in leaves: @@ -120,7 +119,7 @@ def es_setop(es, query): for c in leaves: if len(c.nested_path) == 1: jx_name = untype_path(c.names["."]) - if c.type == NESTED: + if c.jx_type == NESTED: es_query.stored_fields = ["_source"] new_select.append({ "name": select.name, @@ -144,7 +143,7 @@ def es_setop(es, query): filters[0][k] = None set_default( filters[0], - {"bool": {"must": [where, {"bool": {"should": nested_filter}}]}} + es_and([where, es_or(nested_filter)]) ) nested_path = c.nested_path[0] @@ -156,7 +155,7 @@ def es_setop(es, query): where.nested.inner_hits._source = False where.nested.inner_hits.stored_fields += [c.es_column] - child = relative_field(untype_path(c.names[schema.query_path]), s_column) + child = relative_field(untype_path(c.names[schema.query_path[0]]), s_column) pull = accumulate_nested_doc(nested_path, Variable(relative_field(s_column, unnest_path(nested_path)))) new_select.append({ "name": select.name, @@ -169,7 +168,7 @@ def es_setop(es, query): "pull": pull }) else: - nested_selects[nested_path].nested.inner_hits.stored_fields+=[c.es_column] + nested_selects[nested_path].nested.inner_hits.stored_fields += [c.es_column] else: new_select.append({ "name": select.name, @@ -178,11 +177,8 @@ def es_setop(es, query): }) put_index += 1 else: - painless = select.value.partial_eval().to_painless(schema) - es_query.script_fields[literal_field(select.name)] = {"script": { - "lang": "painless", - "inline": painless.script(schema) - }} + painless = select.value.partial_eval().to_es_script(schema) + es_query.script_fields[literal_field(select.name)] = es_script(painless.script(schema)) new_select.append({ "name": select.name, "pull": jx_expression_to_function("fields." 
+ literal_field(select.name)), @@ -346,6 +342,7 @@ set_default(format_dispatch, { "list": (format_list, None, "application/json") }) + def get_pull(column): if column.nested_path[0] == ".": return concat_field("fields", literal_field(column.es_column)) diff --git a/vendor/jx_elasticsearch/es52/util.py b/vendor/jx_elasticsearch/es52/util.py index ef0ff59..d989968 100644 --- a/vendor/jx_elasticsearch/es52/util.py +++ b/vendor/jx_elasticsearch/es52/util.py @@ -11,6 +11,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals +from mo_future import text_type + +from mo_logs import Log + from jx_base import STRING, BOOLEAN, NUMBER, OBJECT from jx_elasticsearch.es52.expressions import Variable from mo_dots import wrap @@ -23,18 +27,21 @@ def es_query_template(path): :return: """ + if not isinstance(path, text_type): + Log.error("expecting path to be a string") + if path != ".": f0 = {} f1 = {} output = wrap({ - "query": {"bool": {"must": [ + "query": es_and([ f0, {"nested": { "path": path, "query": f1, "inner_hits": {"size": 100000} }} - ]}}, + ]), "from": 0, "size": 0, "sort": [] @@ -43,7 +50,7 @@ def es_query_template(path): else: f0 = {} output = wrap({ - "query": {"bool": {"must": [f0]}}, + "query": es_and([f0]), "from": 0, "size": 0, "sort": [] @@ -66,7 +73,7 @@ def jx_sort_to_es_sort(sort, schema): for type in types: for c in cols: - if c.type == type: + if c.jx_type == type: if s.sort == -1: output.append({c.es_column: "desc"}) else: @@ -109,3 +116,22 @@ aggregates = { NON_STATISTICAL_AGGS = {"none", "one"} + +def es_and(terms): + return wrap({"bool": {"filter": terms}}) + + +def es_or(terms): + return wrap({"bool": {"should": terms}}) + + +def es_not(term): + return wrap({"bool": {"must_not": term}}) + + +def es_script(term): + return wrap({"script": {"lang": "painless", "inline": term}}) + + +def es_missing(term): + return {"bool": {"must_not": {"exists": {"field": term}}}} diff --git a/vendor/jx_elasticsearch/meta.py b/vendor/jx_elasticsearch/meta.py index a80f087..597ddaf 100644 --- a/vendor/jx_elasticsearch/meta.py +++ b/vendor/jx_elasticsearch/meta.py @@ -12,24 +12,28 @@ from __future__ import division from __future__ import unicode_literals import itertools -from copy import copy from itertools import product -from jx_base import STRUCT, Table +import jx_base +from jx_base.namespace import Namespace +from mo_math import MAX + +from mo_collections.relation import Relation_usingList + +from jx_base import STRUCT, TableDesc, BOOLEAN from jx_base.query import QueryOp -from jx_base.schema import Schema from jx_python import jx, meta as jx_base_meta from jx_python.containers.list_usingPythonList import ListContainer from jx_python.meta import ColumnList, Column -from mo_dots import Data, relative_field, concat_field, SELF_PATH, ROOT_PATH, coalesce, set_default, Null, split_field, join_field, wrap -from mo_json.typed_encoder import EXISTS_TYPE +from mo_dots import Data, relative_field, SELF_PATH, ROOT_PATH, coalesce, set_default, Null, split_field, join_field, wrap, concat_field, startswith_field, literal_field +from mo_json.typed_encoder import EXISTS_TYPE, TYPE_PREFIX, untype_path, unnest_path from mo_kwargs import override from mo_logs import Log from mo_logs.strings import quote from mo_threads import Queue, THREAD_STOP, Thread, Till from mo_times import HOUR, MINUTE, Timer, Date from pyLibrary.env import elasticsearch -from pyLibrary.env.elasticsearch import es_type_to_json_type +from pyLibrary.env.elasticsearch import 
es_type_to_json_type, _get_best_type_from_mapping MAX_COLUMN_METADATA_AGE = 12 * HOUR ENABLE_META_SCAN = False @@ -39,9 +43,9 @@ OLD_METADATA = MINUTE TEST_TABLE_PREFIX = "testing" # USED TO TURN OFF COMPLAINING ABOUT TEST INDEXES -class FromESMetadata(Schema): +class ElasticsearchMetadata(Namespace): """ - QUERY THE METADATA + MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER """ def __new__(cls, *args, **kwargs): @@ -59,21 +63,31 @@ class FromESMetadata(Schema): self.too_old = TOO_OLD self.settings = kwargs self.default_name = coalesce(name, alias, index) - self.default_es = elasticsearch.Cluster(kwargs=kwargs) + self.es_cluster = elasticsearch.Cluster(kwargs=kwargs) self.index_does_not_exist = set() self.todo = Queue("refresh metadata", max=100000, unique=True) + self.index_to_alias = Relation_usingList() + + self.es_metadata = Null - self.abs_columns = set() + # self.abs_columns = set() self.last_es_metadata = Date.now()-OLD_METADATA self.meta = Data() - table_columns = metadata_tables() - column_columns = metadata_columns() - self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns})) self.meta.columns = ColumnList() - self.meta.columns.insert(column_columns) - self.meta.columns.insert(table_columns) + + self.alias_to_query_paths = { + "meta.columns": [['.']], + "meta.tables": [['.']] + } + self.alias_new_since = { + "meta.columns": Date.now(), + "meta.tables": Date.now() + } + table_columns = metadata_tables() + self.meta.tables = ListContainer("meta.tables", [], jx_base.Schema(".", table_columns)) + self.meta.columns.extend(table_columns) # TODO: fix monitor so it does not bring down ES if ENABLE_META_SCAN: self.worker = Thread.run("refresh metadata", self.monitor) @@ -81,79 +95,52 @@ class FromESMetadata(Schema): self.worker = Thread.run("refresh metadata", self.not_monitor) return - @property - def query_path(self): - return None - @property def url(self): - return self.default_es.path + "/" + self.default_name.replace(".", "/") + return self.es_cluster.path + "/" + self.default_name.replace(".", "/") - def get_table(self, table_name): - with self.meta.tables.locker: - return wrap([t for t in self.meta.tables.data if t.name == table_name]) + def _reload_columns(self, alias=None): + """ + :param alias: A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS) + :return: + """ + # FIND ALL INDEXES OF ALIAS + canonical_index = self.es_cluster.get_best_matching_index(alias).index + times = self.es_cluster.index_new_since - def _upsert_column(self, c): - # ASSUMING THE self.meta.columns.locker IS HAD - existing_columns = self.meta.columns.find(c.es_index, c.names["."]) - for canonical in existing_columns: - if canonical.type == c.type and canonical is not c: - set_default(c.names, canonical.names) - for key in Column.__slots__: - canonical[key] = c[key] - if DEBUG: - Log.note("todo: {{table}}::{{column}}", table=canonical.es_index, column=canonical.es_column) - self.todo.add(canonical) - break - else: - self.meta.columns.add(c) - self.todo.add(c) + indexes = self.index_to_alias.get_domain(alias) + update_required = not (MAX(times[i] for i in indexes) < self.es_cluster.last_metadata) + metadata = self.es_cluster.get_metadata(force=update_required) - if ENABLE_META_SCAN: - if DEBUG: - Log.note("todo: {{table}}::{{column}}", table=c.es_index, column=c.es_column) - # MARK meta.columns AS DIRTY TOO - cols = self.meta.columns.find("meta.columns", None) - for cc in cols: - cc.partitions = cc.cardinality = None - cc.last_updated = Date.now() 
- TOO_OLD - self.todo.extend(cols) + props = [ + # (index, type, properties) TRIPLE + (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties) + for i, d in metadata.indices.items() + if i in indexes + for t, m in [_get_best_type_from_mapping(d.mappings)] + ] - def _get_columns(self, table=None): - # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE - table_path = split_field(table) - es_index = table_path[0] - meta = self.es_metadata.indices[es_index] - if not meta or self.last_es_metadata < Date.now() - OLD_METADATA: - self.es_metadata = self.default_es.get_metadata(force=True) - meta = self.es_metadata.indices[es_index] + # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT + dirty = False + all_comparisons = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props))) + # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE + for (i1, t1, p1), (i2, t2, p2) in all_comparisons: + diff = elasticsearch.diff_schema(p2, p1) + if not self.settings.read_only: + for d in diff: + dirty = True + i1.add_property(*d) + meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index] - for data_type, properties in meta.mappings.items(): - if data_type == "_default_": - continue - properties.properties["_id"] = {"type": "string", "index": "not_analyzed"} - self._parse_properties(meta.index, properties, meta) + data_type, mapping = _get_best_type_from_mapping(meta.mappings) + mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"} + self._parse_properties(alias, mapping, meta) - def _parse_properties(self, abs_index, properties, meta): - # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND - # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES - def add_column(c, query_path): - c.last_updated = Date.now() - TOO_OLD - if query_path[0] != ".": - c.names[query_path[0]] = relative_field(c.names["."], query_path[0]) - - with self.meta.columns.locker: - for alias in meta.aliases: - c_ = copy(c) - c_.es_index = alias - self._upsert_column(c_) - self._upsert_column(c) - - abs_columns = elasticsearch.parse_properties(abs_index, None, properties.properties) - self.abs_columns.update(abs_columns) + def _parse_properties(self, alias, mapping, meta): + abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties) with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG): # LIST OF EVERY NESTED PATH - query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"] + query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"] for a, b in itertools.product(query_paths, query_paths): aa = a[0] bb = b[0] @@ -166,15 +153,17 @@ class FromESMetadata(Schema): b.insert(i, aa) break for q in query_paths: - q.append(".") - query_paths.append(SELF_PATH) + q.append(SELF_PATH) + query_paths.append(ROOT_PATH) + self.alias_to_query_paths[alias] = query_paths - # ADD RELATIVE COLUMNS + # ADD RELATIVE NAMES for abs_column in abs_columns: - abs_column = abs_column.__copy__() - abs_column.type = es_type_to_json_type[abs_column.type] + abs_column.last_updated = None + abs_column.jx_type = es_type_to_json_type[abs_column.es_type] for query_path in query_paths: - add_column(abs_column, query_path) + abs_column.names[query_path[0]] = relative_field(abs_column.names["."], query_path[0]) + self.todo.add(self.meta.columns.add(abs_column)) pass def query(self, _query): @@ -191,43 +180,62 @@ class FromESMetadata(Schema): RETURN METADATA COLUMNS """ table_path = split_field(table_name) - es_index_name = 
table_path[0] - query_path = join_field(table_path[1:]) - table = self.get_table(es_index_name)[0] - abs_column_name = None if column_name == None else concat_field(query_path, column_name) + root_table_name = table_path[0] + + # FIND ALIAS + if root_table_name in self.alias_new_since: + alias = root_table_name + else: + alias = self.index_to_alias[root_table_name] + + if not alias: + self.es_cluster.get_metadata(force=True) + # ENSURE INDEX -> ALIAS IS IN A MAPPING FOR LATER + for a in self.es_cluster.get_aliases(): + self.alias_new_since[a.alias] = MAX([self.es_cluster.index_new_since[a.index], self.alias_new_since.get(a.alias)]) + self.index_to_alias[a.index] = coalesce(a.alias, a.index) + + if root_table_name in self.alias_new_since: + alias = root_table_name + else: + alias = self.index_to_alias[root_table_name] + + if not alias: + Log.error("{{table|quote}} does not exist", table=table_name) + + now = Date.now() + table = self.get_table(alias)[0] try: # LAST TIME WE GOT INFO FOR THIS TABLE if not table: - table = Table( - name=es_index_name, + table = TableDesc( + name=alias, url=None, query_path=['.'], timestamp=Date.now() ) with self.meta.tables.locker: self.meta.tables.add(table) - self._get_columns(table=es_index_name) - elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE: - table.timestamp = Date.now() - self._get_columns(table=es_index_name) + self._reload_columns(alias=alias) + elif force or table.timestamp < now - MAX_COLUMN_METADATA_AGE: + table.timestamp = now + self._reload_columns(alias=alias) - with self.meta.columns.locker: - columns = self.meta.columns.find(es_index_name, column_name) - if columns: - columns = jx.sort(columns, "names.\\.") - # AT LEAST WAIT FOR THE COLUMNS TO UPDATE - while len(self.todo) and not all(columns.get("last_updated")): - if DEBUG: + columns = self.meta.columns.find(alias, column_name) + columns = jx.sort(columns, "names.\.") + # AT LEAST WAIT FOR THE COLUMNS TO UPDATE + while len(self.todo) and not all(columns.get("last_updated")): + if DEBUG: + if len(columns) > 10: + Log.note("waiting for {{num}} columns to update", num=len([c for c in columns if not c.last_updated])) + else: Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated]) - Till(seconds=1).wait() - return columns + Till(seconds=1).wait() + return columns except Exception as e: Log.error("Not expected", cause=e) - if abs_column_name: - Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name) - return [] def _update_cardinality(self, column): @@ -237,44 +245,42 @@ class FromESMetadata(Schema): if column.es_index in self.index_does_not_exist: return - if column.type in STRUCT: + if column.jx_type in STRUCT: Log.error("not supported") try: if column.es_index == "meta.columns": - with self.meta.columns.locker: - partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None]) - self.meta.columns.update({ - "set": { - "partitions": partitions, - "count": len(self.meta.columns), - "cardinality": len(partitions), - "multi": 1, - "last_updated": Date.now() - }, - "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} - }) + partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None]) + self.meta.columns.update({ + "set": { + "partitions": partitions, + "count": 
len(self.meta.columns), + "cardinality": len(partitions), + "multi": 1, + "last_updated": Date.now() + }, + "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} + }) return if column.es_index == "meta.tables": - with self.meta.columns.locker: - partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None]) - self.meta.columns.update({ - "set": { - "partitions": partitions, - "count": len(self.meta.tables), - "cardinality": len(partitions), - "multi": 1, - "last_updated": Date.now() - }, - "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} - }) + partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None]) + self.meta.columns.update({ + "set": { + "partitions": partitions, + "count": len(self.meta.tables), + "cardinality": len(partitions), + "multi": 1, + "last_updated": Date.now() + }, + "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} + }) return es_index = column.es_index.split(".")[0] - is_text = [cc for cc in self.abs_columns if cc.es_column == column.es_column and cc.type == "text"] + is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"] if is_text: # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED - result = self.default_es.post("/" + es_index + "/_search", data={ + result = self.es_cluster.post("/" + es_index + "/_search", data={ "aggs": { "count": {"filter": {"match_all": {}}} }, @@ -284,14 +290,24 @@ class FromESMetadata(Schema): cardinality = 1001 multi = 1001 elif column.es_column == "_id": - result = self.default_es.post("/" + es_index + "/_search", data={ + result = self.es_cluster.post("/" + es_index + "/_search", data={ "query": {"match_all": {}}, "size": 0 }) count = cardinality = result.hits.total multi = 1 + elif column.es_type == BOOLEAN: + result = self.es_cluster.post("/" + es_index + "/_search", data={ + "aggs": { + "count": _counting_query(column) + }, + "size": 0 + }) + count = result.hits.total + cardinality = 2 + multi = 1 else: - result = self.default_es.post("/" + es_index + "/_search", data={ + result = self.es_cluster.post("/" + es_index + "/_search", data={ "aggs": { "count": _counting_query(column), "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}} @@ -308,47 +324,44 @@ class FromESMetadata(Schema): query = Data(size=0) if column.es_column == "_id": - with self.meta.columns.locker: - self.meta.columns.update({ - "set": { - "count": cardinality, - "cardinality": cardinality, - "multi": 1, - "last_updated": Date.now() - }, - "clear": ["partitions"], - "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} - }) + self.meta.columns.update({ + "set": { + "count": cardinality, + "cardinality": cardinality, + "multi": 1, + "last_updated": Date.now() + }, + "clear": ["partitions"], + "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} + }) return elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99): if DEBUG: Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality) - with self.meta.columns.locker: - self.meta.columns.update({ - "set": { - "count": count, - "cardinality": cardinality, - "multi": multi, - "last_updated": Date.now() - }, - "clear": ["partitions"], - "where": {"eq": {"es_index": 
column.es_index, "es_column": column.es_column}} - }) + self.meta.columns.update({ + "set": { + "count": count, + "cardinality": cardinality, + "multi": multi, + "last_updated": Date.now() + }, + "clear": ["partitions"], + "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} + }) return - elif column.type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30: + elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30: if DEBUG: Log.note("{{field}} has {{num}} parts", field=column.es_index, num=cardinality) - with self.meta.columns.locker: - self.meta.columns.update({ - "set": { - "count": count, - "cardinality": cardinality, - "multi": multi, - "last_updated": Date.now() - }, - "clear": ["partitions"], - "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} - }) + self.meta.columns.update({ + "set": { + "count": count, + "cardinality": cardinality, + "multi": multi, + "last_updated": Date.now() + }, + "clear": ["partitions"], + "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} + }) return elif len(column.nested_path) != 1: query.aggs["_"] = { @@ -360,7 +373,7 @@ class FromESMetadata(Schema): else: query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}} - result = self.default_es.post("/" + es_index + "/_search", data=query) + result = self.es_cluster.post("/" + es_index + "/_search", data=query) aggs = result.aggregations._ if aggs._nested: @@ -368,19 +381,16 @@ class FromESMetadata(Schema): else: parts = jx.sort(aggs.buckets.key) - if DEBUG: - Log.note("{{field}} has {{parts}}", field=column.names["."], parts=parts) - with self.meta.columns.locker: - self.meta.columns.update({ - "set": { - "count": count, - "cardinality": cardinality, - "multi": multi, - "partitions": parts, - "last_updated": Date.now() - }, - "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} - }) + self.meta.columns.update({ + "set": { + "count": count, + "cardinality": cardinality, + "multi": multi, + "partitions": parts, + "last_updated": Date.now() + }, + "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}} + }) except Exception as e: # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING # from tests.test_jx import TEST_TABLE @@ -389,11 +399,10 @@ class FromESMetadata(Schema): is_test_table = any(column.es_index.startswith(t) for t in [TEST_TABLE_PREFIX, TEST_TABLE]) if is_missing_index and is_test_table: # WE EXPECT TEST TABLES TO DISAPPEAR - with self.meta.columns.locker: - self.meta.columns.update({ - "clear": ".", - "where": {"eq": {"es_index": column.es_index}} - }) + self.meta.columns.update({ + "clear": ".", + "where": {"eq": {"es_index": column.es_index}} + }) self.index_does_not_exist.add(column.es_index) else: self.meta.columns.update({ @@ -415,42 +424,42 @@ class FromESMetadata(Schema): while not please_stop: try: if not self.todo: - with self.meta.columns.locker: - old_columns = [ - c - for c in self.meta.columns - if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.type not in STRUCT - ] - if old_columns: - if DEBUG: - Log.note( - "Old columns {{names|json}} last updated {{dates|json}}", - names=wrap(old_columns).es_column, - dates=[Date(t).format() for t in wrap(old_columns).last_updated] - ) - self.todo.extend(old_columns) - # TEST CONSISTENCY - for c, d in product(list(self.todo.queue), list(self.todo.queue)): - if c.es_column == d.es_column and c.es_index == d.es_index and c != d: - Log.error("") - else: - if DEBUG: 
- Log.note("no more metatdata to update") + old_columns = [ + c + for c in self.meta.columns + if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.jx_type not in STRUCT + ] + if old_columns: + if DEBUG: + Log.note( + "Old columns {{names|json}} last updated {{dates|json}}", + names=wrap(old_columns).es_column, + dates=[Date(t).format() for t in wrap(old_columns).last_updated] + ) + self.todo.extend(old_columns) + # TEST CONSISTENCY + for c, d in product(list(self.todo.queue), list(self.todo.queue)): + if c.es_column == d.es_column and c.es_index == d.es_index and c != d: + Log.error("") + else: + if DEBUG: + Log.note("no more metatdata to update") column = self.todo.pop(Till(seconds=(10*MINUTE).seconds)) - if DEBUG: - Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column) if column: - if column.es_index in self.index_does_not_exist: - with self.meta.columns.locker: - self.meta.columns.update({ - "clear": ".", - "where": {"eq": {"es_index": column.es_index}} - }) + if column is THREAD_STOP: continue - if column.type in STRUCT or column.es_column.endswith("." + EXISTS_TYPE): - with self.meta.columns.locker: - column.last_updated = Date.now() + + if DEBUG: + Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column) + if column.es_index in self.index_does_not_exist: + self.meta.columns.update({ + "clear": ".", + "where": {"eq": {"es_index": column.es_index}} + }) + continue + if column.jx_type in STRUCT or column.es_column.endswith("." + EXISTS_TYPE): + column.last_updated = Date.now() continue elif column.last_updated >= Date.now()-TOO_OLD: continue @@ -471,24 +480,159 @@ class FromESMetadata(Schema): if c == THREAD_STOP: break - if not c.last_updated or c.last_updated >= Date.now()-TOO_OLD: + if c.last_updated >= Date.now()-TOO_OLD: continue - with self.meta.columns.locker: - self.meta.columns.update({ - "set": { - "last_updated": Date.now() - }, - "clear":[ - "count", - "cardinality", - "multi", - "partitions", - ], - "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}} - }) + self.meta.columns.update({ + "set": { + "last_updated": Date.now() + }, + "clear":[ + "count", + "cardinality", + "multi", + "partitions", + ], + "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}} + }) if DEBUG: - Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c) + Log.note("Did not get {{col.es_index}}.{{col.es_column}} info", col=c) + + def get_table(self, alias_name): + with self.meta.tables.locker: + return wrap([t for t in self.meta.tables.data if t.name == alias_name]) + + def get_snowflake(self, fact_table_name): + return Snowflake(fact_table_name, self) + + def get_schema(self, name): + if name == "meta.columns": + return self.meta.columns.schema + query_path = split_field(name) + return self.get_snowflake(query_path[0]).get_schema(join_field(query_path[1:])) + + +class Snowflake(object): + """ + REPRESENT ONE ALIAS, AND ITS NESTED ARRAYS + """ + + def __init__(self, alias, namespace): + self.alias = alias + self.namespace = namespace + + + def get_schema(self, query_path): + return Schema(query_path, self) + + @property + def query_paths(self): + """ + RETURN A LIST OF ALL NESTED COLUMNS + """ + return self.namespace.alias_to_query_paths[self.alias] + + @property + def columns(self): + """ + RETURN ALL COLUMNS FROM ORIGIN OF FACT TABLE + """ + return self.namespace.get_columns(literal_field(self.alias)) + + +class Schema(jx_base.Schema): + """ + REPRESENT JUST ONE TABLE IN 
+ + +class Schema(jx_base.Schema): + """ + REPRESENT JUST ONE TABLE IN A SNOWFLAKE + """ + + def __init__(self, query_path, snowflake): + if not isinstance(snowflake.query_paths[0], list): + Log.error("Snowflake query paths should be a list of string tuples (well, technically, a list of lists of strings)") + self.query_path = [ + p + for p in snowflake.query_paths + if untype_path(p[0]) == query_path + ][0] + self.snowflake = snowflake + + def leaves(self, column_name): + """ + :param column_name: + :return: ALL COLUMNS THAT START WITH column_name, NOT INCLUDING DEEPER NESTED COLUMNS + """ + column_name = unnest_path(column_name) + columns = self.columns + deep_path = self.query_path[0] + for path in self.query_path: + output = [ + c + for c in columns + if ( + (c.names['.'] != "_id" or column_name == "_id") and + c.jx_type not in OBJECTS and + startswith_field(unnest_path(c.names[path]), column_name) + ) + ] + if output: + return output + return [] + + def values(self, column_name): + """ + RETURN ALL COLUMNS THAT column_name REFERS TO + """ + column_name = unnest_path(column_name) + columns = self.columns + deep_path = self.query_path[0] + for path in self.query_path: + output = [ + c + for c in columns + if ( + c.jx_type not in STRUCT and + untype_path(c.names[path]) == column_name + ) + ] + if output: + return output + return output + + def __getitem__(self, column_name): + return self.values(column_name) + + @property + def name(self): + return concat_field(self.snowflake.alias, self.query_path[0]) + + @property + def columns(self): + return self.snowflake.namespace.get_columns(literal_field(self.snowflake.alias)) + + def map_to_es(self): + """ + RETURN A MAP FROM THE NAMESPACE TO THE es_column NAME + """ + output = {} + for path in self.query_path: + set_default( + output, + { + k: c.es_column + for c in self.snowflake.columns + if c.jx_type not in STRUCT + for rel_name in [c.names[path]] + for k in [rel_name, untype_path(rel_name), unnest_path(rel_name)] + } + ) + return output + + +class Table(jx_base.Table): + + def __init__(self, full_name, container): + jx_base.Table.__init__(self, full_name) + self.container=container + self.schema = container.namespace.get_schema(full_name) def _counting_query(c): @@ -502,7 +646,7 @@ def _counting_query(c): "aggs": { "_nested": {"cardinality": { "field": c.es_column, - "precision_threshold": 10 if c.type in elasticsearch.ES_NUMERIC_TYPES else 100 + "precision_threshold": 10 if c.es_type in elasticsearch.ES_NUMERIC_TYPES else 100 }} } } @@ -512,59 +656,6 @@ def _counting_query(c): }} -def metadata_columns(): - return wrap( - [ - Column( - names={".":c}, - es_index="meta.columns", - es_column=c, - type="string", - nested_path=ROOT_PATH - ) - for c in [ - "type", - "nested_path", - "es_column", - "es_index" - ] - ] + [ - Column( - es_index="meta.columns", - names={".":c}, - es_column=c, - type="object", - nested_path=ROOT_PATH - ) - for c in [ - "names", - "domain", - "partitions" - ] - ] + [ - Column( - names={".": c}, - es_index="meta.columns", - es_column=c, - type="long", - nested_path=ROOT_PATH - ) - for c in [ - "count", - "cardinality" - ] - ] + [ - Column( - names={".": "last_updated"}, - es_index="meta.columns", - es_column="last_updated", - type="time", - nested_path=ROOT_PATH - ) - ] - ) - -
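# NOTE: a hedged sketch (not part of the patch) of the Column shape this diff
# standardizes on: the old single "type" field becomes the es_type/jx_type pair,
# with jx_type derived via es_type_to_json_type (see _parse_properties above).
# The index and column names here are invented for illustration:
#
#     c = Column(
#         names={".": "build.revision"},   # map from table name to relative column name
#         es_index="task",
#         es_column="build.revision",
#         es_type="keyword",               # the type the cluster reports
#         nested_path=ROOT_PATH
#     )
#     c.jx_type = es_type_to_json_type[c.es_type]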
 def metadata_tables(): return wrap( [ Column( names={".": c}, es_index="meta.tables", es_column=c, - type="string", + es_type="string", nested_path=ROOT_PATH ) for c in [ "name", "url", "query_path" ] ]+[ Column( - names={".": "timestamp"}, + names={".": c}, es_index="meta.tables", - es_column="timestamp", - type="integer", + es_column=c, + es_type="integer", nested_path=ROOT_PATH ) + for c in [ + "timestamp" + ] ] ) -def init_database(sql): - - - - sql.execute(""" - CREATE TABLE tables AS ( - table_name VARCHAR(200), - alias CHAR - - ) - - - """) - - - +OBJECTS = (jx_base.OBJECT, jx_base.EXISTS) diff --git a/vendor/jx_python/__init__.py b/vendor/jx_python/__init__.py index 7e78816..1a25337 100644 --- a/vendor/jx_python/__init__.py +++ b/vendor/jx_python/__init__.py @@ -39,18 +39,18 @@ def _delayed_imports(): MySQL = None try: - from jx_elasticsearch.meta import FromESMetadata + from jx_elasticsearch.meta import ElasticsearchMetadata except Exception: - FromESMetadata = None + ElasticsearchMetadata = None set_default(container.type2container, { "mysql": MySQL, "memory": None, - "meta": FromESMetadata + "meta": ElasticsearchMetadata }) -def wrap_from(frum, schema=None): +def find_container(frum, schema=None): """ :param frum: :param schema: :return: """ @@ -66,7 +66,6 @@ def wrap_from(frum, schema=None): Log.error("expecting jx_base.container.config.default.settings to contain default elasticsearch connection info") type_ = None - index = frum if frum.startswith("meta."): if frum == "meta.columns": return _meta.singlton.meta.columns.denormalized() elif frum == "meta.tables": return _meta.singlton.meta.tables else: Log.error("{{name}} not a recognized table", name=frum) - else: - type_ = container.config.default.type - index = split_field(frum)[0] + + type_ = container.config.default.type + fact_table_name = split_field(frum)[0] settings = set_default( { - "index": index, + "index": fact_table_name, "name": frum, "exists": True, }, @@ -95,7 +94,7 @@ def wrap_from(frum, schema=None): return container.type2container[frum.type](frum.settings) elif isinstance(frum, Mapping) and (frum["from"] or isinstance(frum["from"], (list, set))): from jx_base.query import QueryOp - return QueryOp.wrap(frum, schema=schema) + return QueryOp.wrap(frum, namespace=schema) elif isinstance(frum, (list, set)): return _ListContainer("test_list", frum) else:
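The rename of wrap_from() to find_container() (above) makes the contract plainer: a string resolves to a Container through container.config.default, while a plain Python list is wrapped in a ListContainer (modified below). A small usage sketch, assuming default container settings are configured elsewhere; the table name and rows are invented:

    from jx_python import find_container

    tasks = find_container("task")                                 # named fact table, via default settings
    people = find_container([{"name": "kyle"}, {"name": "les"}])   # plain list becomes a ListContainer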
diff --git a/vendor/jx_python/containers/list_usingPythonList.py b/vendor/jx_python/containers/list_usingPythonList.py index c51531e..11f9bfd 100644 --- a/vendor/jx_python/containers/list_usingPythonList.py +++ b/vendor/jx_python/containers/list_usingPythonList.py @@ -14,6 +14,15 @@ from __future__ import unicode_literals import itertools from collections import Mapping +from mo_math import UNION + +import jx_base +from jx_base import Container +from jx_base.expressions import jx_expression, Expression, Variable, TRUE +from jx_python.expression_compiler import compile_expression +from jx_python.expressions import jx_expression_to_function +from jx_python.lists.aggs import is_aggs, list_aggs +from jx_python.meta import get_schema_from_list from mo_collections import UniqueIndex from mo_dots import Data, wrap, listwrap, unwraplist, unwrap, Null from mo_future import sort_using_key @@ -21,21 +30,17 @@ from mo_logs import Log from mo_threads import Lock from pyLibrary import convert -from jx_base.expressions import jx_expression, Expression, TrueOp, Variable, TRUE -from jx_python.expressions import jx_expression_to_function -from jx_base.container import Container -from jx_python.expression_compiler import compile_expression -from jx_python.lists.aggs import is_aggs, list_aggs -from jx_python.meta import get_schema_from_list _get = object.__getattribute__ -class ListContainer(Container): +class ListContainer(Container, jx_base.Namespace, jx_base.Table): + """ + A CONTAINER WITH ONLY ONE TABLE + """ def __init__(self, name, data, schema=None): # TODO: STORE THIS LIKE A CUBE FOR FASTER ACCESS AND TRANSFORMATION data = list(unwrap(data)) - Container.__init__(self, data, schema) + Container.__init__(self) if schema == None: self._schema = get_schema_from_list(name, data) else: self._schema = schema @@ -52,6 +57,10 @@ class ListContainer(Container): def schema(self): return self._schema + @property + def namespace(self): + return self + def last(self): """ :return: Last element in the list, or Null """ @@ -91,7 +100,7 @@ class ListContainer(Container): elif q.format == "table": head = [c.names['.'] for c in output.schema.columns] data = [ - [r[h] for h in head] + [r if h == '.' else r[h] for h in head] for r in output.data ] return Data(header=head, data=data, meta={"format": "table"}) @@ -170,6 +179,13 @@ class ListContainer(Container): new_schema = None if isinstance(select, list): + if all( + isinstance(s.value, Variable) and s.name == s.value.var + for s in select + ): + names = set(s.value.var for s in select) + new_schema = Schema(".", [c for c in self.schema.columns if c.names['.'] in names]) + push_and_pull = [(s.name, jx_expression_to_function(s.value)) for s in selects] def selector(d): output = Data() @@ -250,6 +266,23 @@ class ListContainer(Container): def __len__(self): return len(self.data) + # class Namespace(jx_base.Namespace): + + def get_snowflake(self, name): + if self.name != name: + Log.error("This container only has table by name of {{name}}", name=name) + return self + + def get_schema(self, name): + if self.name != name: + Log.error("This container only has table by name of {{name}}", name=name) + return self.schema + + def get_table(self, name): + if self.name != name: + Log.error("This container only has table by name of {{name}}", name=name) + return self + def _exec(code): try: @@ -261,6 +294,7 @@ def _exec(code): + from jx_base.schema import Schema from jx_python import jx diff --git a/vendor/jx_python/jx.py b/vendor/jx_python/jx.py index d1b9f52..a55e300 100644 --- a/vendor/jx_python/jx.py +++ b/vendor/jx_python/jx.py @@ -60,64 +60,64 @@ def get(expr): return jx_expression_to_function(expr) -def run(query, frum=Null): +def run(query, container=Null): """ THIS FUNCTION IS SIMPLY SWITCHING BASED ON THE query["from"] CONTAINER, BUT IT IS ALSO PROCESSING A list CONTAINER; SEPARATE TO A ListContainer """ - if frum == None: - frum = wrap(query)['from'] - query_op = QueryOp.wrap(query, table=frum, schema=frum.schema) + if container == None: + container = wrap(query)['from'].container + query_op = QueryOp.wrap(query, container=container, namespace=container.schema) else: - query_op = QueryOp.wrap(query, frum, frum.schema) + query_op = QueryOp.wrap(query, container, container.namespace) - if frum == None: + if container == None: from jx_python.containers.list_usingPythonList import DUAL return DUAL.query(query_op) - elif isinstance(frum, Container): - return frum.query(query_op) - elif isinstance(frum, (list, set) + generator_types): - frum = wrap(list(frum)) - elif isinstance(frum, Cube): + elif isinstance(container, Container): + return container.query(query_op) + elif isinstance(container, (list, set) + generator_types): + container = wrap(list(container)) + elif isinstance(container, Cube): if is_aggs(query_op): - return cube_aggs(frum, query_op) - elif isinstance(frum, QueryOp): - frum = run(frum) + return cube_aggs(container, query_op) + elif isinstance(container, QueryOp): + 
container = run(container) else: - Log.error("Do not know how to handle {{type}}", type=frum.__class__.__name__) + Log.error("Do not know how to handle {{type}}", type=container.__class__.__name__) if is_aggs(query_op): - frum = list_aggs(frum, query_op) + container = list_aggs(container, query_op) else: # SETOP if query_op.where is not TRUE: - frum = filter(frum, query_op.where) + container = filter(container, query_op.where) if query_op.sort: - frum = sort(frum, query_op.sort, already_normalized=True) + container = sort(container, query_op.sort, already_normalized=True) if query_op.select: - frum = select(frum, query_op.select) + container = select(container, query_op.select) if query_op.window: - if isinstance(frum, Cube): - frum = list(frum.values()) + if isinstance(container, Cube): + container = list(container.values()) for param in query_op.window: - window(frum, param) + window(container, param) # AT THIS POINT frum IS IN LIST FORMAT, NOW PACKAGE RESULT if query_op.format == "cube": - frum = convert.list2cube(frum) + container = convert.list2cube(container) elif query_op.format == "table": - frum = convert.list2table(frum) - frum.meta.format = "table" + container = convert.list2table(container) + container.meta.format = "table" else: - frum = wrap({ + container = wrap({ "meta": {"format": "list"}, - "data": frum + "data": container }) - return frum + return container groupby = group_by.groupby diff --git a/vendor/jx_python/meta.py b/vendor/jx_python/meta.py index 2ba8e71..cb80115 100644 --- a/vendor/jx_python/meta.py +++ b/vendor/jx_python/meta.py @@ -11,19 +11,18 @@ from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals +from collections import Mapping from datetime import date from datetime import datetime -from jx_base import STRUCT, Column -from jx_base.container import Container +import jx_base +from jx_base import python_type_to_json_type +from jx_base import STRUCT, Column, Table from jx_base.schema import Schema from jx_python import jx from mo_collections import UniqueIndex -from mo_dots import Data, concat_field, get_attr, listwrap, unwraplist, NullType, FlatList -from mo_dots import split_field, join_field, ROOT_PATH -from mo_dots import wrap -from mo_future import none_type -from mo_future import text_type, long, PY2 +from mo_dots import Data, concat_field, get_attr, listwrap, unwraplist, NullType, FlatList, set_default, split_field, join_field, ROOT_PATH, wrap +from mo_future import none_type, text_type, long, PY2 from mo_json.typed_encoder import untype_path, unnest_path from mo_logs import Log from mo_threads import Lock @@ -32,48 +31,101 @@ from mo_times.dates import Date singlton = None -class ColumnList(Container): +class ColumnList(Table): """ OPTIMIZED FOR THE PARTICULAR ACCESS PATTERNS USED """ def __init__(self): + Table.__init__(self, "meta.columns") self.data = {} # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS) self.locker = Lock() - self.count = 0 - self.meta_schema = None + self._schema = None + self.extend(METADATA_COLUMNS) def find(self, es_index, abs_column_name): - if "." 
in es_index and not es_index.startswith("meta."): - Log.error("unlikely index name") - if not abs_column_name: - return [c for cs in self.data.get(es_index, {}).values() for c in cs] - else: - return self.data.get(es_index, {}).get(abs_column_name, []) + with self.locker: + if es_index.startswith("meta."): + self._update_meta() - def insert(self, columns): - for column in columns: - self.add(column) + if not abs_column_name: + return [c for cs in self.data.get(es_index, {}).values() for c in cs] + else: + return self.data.get(es_index, {}).get(abs_column_name, []) + + def extend(self, columns): + self.dirty = True + with self.locker: + for column in columns: + self._add(column) def add(self, column): + self.dirty = True + with self.locker: + return self._add(column) + + def _add(self, column): columns_for_table = self.data.setdefault(column.es_index, {}) - abs_cname = column.names["."] - _columns = columns_for_table.get(abs_cname) - if not _columns: - _columns = columns_for_table[abs_cname] = [] - _columns.append(column) - self.count += 1 + existing_columns = columns_for_table.setdefault(column.names["."], []) + + for canonical in existing_columns: + if canonical is column: + return canonical + if canonical.es_type == column.es_type: + set_default(column.names, canonical.names) + for key in Column.__slots__: + canonical[key] = column[key] + return canonical + existing_columns.append(column) + return column + + def _update_meta(self): + if not self.dirty: + return + + for mcl in self.data.get("meta.columns").values(): + for mc in mcl: + count = 0 + values = set() + objects = 0 + multi = 1 + for t, cs in self.data.items(): + for c, css in cs.items(): + for column in css: + value = column[mc.names["."]] + if value == None: + pass + else: + count += 1 + if isinstance(value, list): + multi = max(multi, len(value)) + try: + values |= set(value) + except Exception: + objects += len(value) + elif isinstance(value, Mapping): + objects += 1 + else: + values.add(value) + mc.count = count + mc.cardinality = len(values) + objects + mc.partitions = jx.sort(values) + mc.multi = multi + mc.last_updated = Date.now() + self.dirty = False def __iter__(self): + self._update_meta() for t, cs in self.data.items(): for c, css in cs.items(): for column in css: yield column def __len__(self): - return self.count + return self.data['meta.columns']['es_index'].count def update(self, command): + self.dirty = True try: command = wrap(command) eq = command.where.eq @@ -81,62 +133,84 @@ class ColumnList(Container): columns = self.find(eq.es_index, eq.name) columns = [c for c in columns if all(get_attr(c, k) == v for k, v in eq.items())] else: - columns = list(self) - columns = jx.filter(columns, command.where) + with self.locker: + columns = list(self) + columns = jx.filter(columns, command.where) - for col in list(columns): - for k in command["clear"]: - if k == ".": - columns.remove(col) - else: - col[k] = None + with self.locker: + for col in list(columns): + for k in command["clear"]: + if k == ".": + columns.remove(col) + else: + col[k] = None - for k, v in command.set.items(): - col[k] = v + for k, v in command.set.items(): + col[k] = v except Exception as e: Log.error("should not happen", cause=e) def query(self, query): - query.frum = self.__iter__() - output = jx.run(query) + with self.locker: + self._update_meta() + query.frum = self.__iter__() + output = jx.run(query) return output def groupby(self, keys): - return jx.groupby(self.__iter__(), keys) + with self.locker: + self._update_meta() + return 
jx.groupby(self.__iter__(), keys) @property def schema(self): - return wrap({k: set(v) for k, v in self.data["meta.columns"].items()}) + if not self._schema: + with self.locker: + self._update_meta() + self._schema = Schema(".", [c for cs in self.data["meta.columns"].values() for c in cs]) + return self._schema + + @property + def namespace(self): + return self def denormalized(self): """ THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM THE DENORMALIZED PERSPECITVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES """ - output = [ - { - "table": concat_field(c.es_index, untype_path(table)), - "name": untype_path(name), - "cardinality": c.cardinality, - "es_column": c.es_column, - "es_index": c.es_index, - "last_updated": c.last_updated, - "count": c.count, - "nested_path": [unnest_path(n) for n in c.nested_path], - "type": c.type - } - for tname, css in self.data.items() - for cname, cs in css.items() - for c in cs - if c.type not in STRUCT # and c.es_column != "_id" - for table, name in c.names.items() - ] - if not self.meta_schema: - self.meta_schema = get_schema_from_list("meta\\.columns", output) + with self.locker: + self._update_meta() + output = [ + { + "table": concat_field(c.es_index, untype_path(table)), + "name": untype_path(name), + "cardinality": c.cardinality, + "es_column": c.es_column, + "es_index": c.es_index, + "last_updated": c.last_updated, + "count": c.count, + "nested_path": [unnest_path(n) for n in c.nested_path], + "es_type": c.es_type, + "type": c.jx_type + } + for tname, css in self.data.items() + for cname, cs in css.items() + for c in cs + if c.jx_type not in STRUCT # and c.es_column != "_id" + for table, name in c.names.items() + ] from jx_python.containers.list_usingPythonList import ListContainer - return ListContainer("meta\\.columns", data=output, schema=self.meta_schema) + return ListContainer( + self.name, + data=output, + schema=jx_base.Schema( + "meta.columns", + SIMPLE_METADATA_COLUMNS + ) + ) def get_schema_from_list(table_name, frum): @@ -169,11 +243,13 @@ def _get_schema_from_list(frum, table_name, parent, nested_path, columns): names={table_name: full_name}, es_column=full_name, es_index=".", - type="undefined", + jx_type=python_type_to_json_type[d.__class__], + es_type=row_type, nested_path=nested_path ) columns.add(column) - column.type = _merge_type[column.type][row_type] + column.es_type = _merge_type[column.es_type][row_type] + column.jx_type = _merge_type[column.jx_type][row_type] else: for name, value in d.items(): full_name = concat_field(parent, name) @@ -183,7 +259,7 @@ def _get_schema_from_list(frum, table_name, parent, nested_path, columns): names={table_name: full_name}, es_column=full_name, es_index=".", - type="undefined", + es_type="undefined", nested_path=nested_path ) columns.add(column) @@ -199,20 +275,87 @@ def _get_schema_from_list(frum, table_name, parent, nested_path, columns): this_type = "nested" else: this_type = _type_to_name[value.__class__] - - new_type = _merge_type[column.type][this_type] - if new_type == None: - Log.error("can not combine {{type1}} with {{type2}} for column {{column}}", type1=column.type, type2=this_type, column=full_name) - column.type = new_type + new_type = _merge_type[column.es_type][this_type] + column.es_type = new_type if this_type == "object": _get_schema_from_list([value], table_name, full_name, nested_path, columns) elif this_type == "nested": np = listwrap(nested_path) - newpath = unwraplist([join_field(split_field(np[0])+[name])]+np) + newpath = 
unwraplist([join_field(split_field(np[0]) + [name])] + np) _get_schema_from_list(value, table_name, full_name, newpath, columns) +METADATA_COLUMNS = ( + [ + Column( + names={".": c}, + es_index="meta.columns", + es_column=c, + es_type="string", + nested_path=ROOT_PATH + ) + for c in ["es_type", "jx_type", "nested_path", "es_column", "es_index"] + ] + [ + Column( + es_index="meta.columns", + names={".": c}, + es_column=c, + es_type="object", + nested_path=ROOT_PATH + ) + for c in ["names", "partitions"] + ] + [ + Column( + names={".": c}, + es_index="meta.columns", + es_column=c, + es_type="long", + nested_path=ROOT_PATH + ) + for c in ["count", "cardinality", "multi"] + ] + [ + Column( + names={".": "last_updated"}, + es_index="meta.columns", + es_column="last_updated", + es_type="time", + nested_path=ROOT_PATH + ) + ] +) + +SIMPLE_METADATA_COLUMNS = ( + [ + Column( + names={".": c}, + es_index="meta.columns", + es_column=c, + es_type="string", + nested_path=ROOT_PATH + ) + for c in ["table", "name", "type", "nested_path"] + ] + [ + Column( + names={".": c}, + es_index="meta.columns", + es_column=c, + es_type="long", + nested_path=ROOT_PATH + ) + for c in ["count", "cardinality", "multi"] + ] + [ + Column( + names={".": "last_updated"}, + es_index="meta.columns", + es_column="last_updated", + es_type="time", + nested_path=ROOT_PATH + ) + ] +) + + _type_to_name = { none_type: "undefined", NullType: "undefined", @@ -242,6 +385,7 @@ _merge_type = { "long": "long", "float": "float", "double": "double", + "number": "number", "string": "string", "object": "object", "nested": "nested" @@ -253,6 +397,7 @@ _merge_type = { "long": "long", "float": "float", "double": "double", + "number": "number", "string": "string", "object": None, "nested": None @@ -264,6 +409,7 @@ _merge_type = { "long": "long", "float": "float", "double": "double", + "number": "number", "string": "string", "object": None, "nested": None @@ -275,6 +421,7 @@ _merge_type = { "long": "long", "float": "double", "double": "double", + "number": "number", "string": "string", "object": None, "nested": None @@ -286,6 +433,7 @@ _merge_type = { "long": "double", "float": "float", "double": "double", + "number": "number", "string": "string", "object": None, "nested": None @@ -297,6 +445,19 @@ _merge_type = { "long": "double", "float": "double", "double": "double", + "number": "number", + "string": "string", + "object": None, + "nested": None + }, + "number": { + "undefined": "number", + "boolean": "number", + "integer": "number", + "long": "number", + "float": "number", + "double": "number", + "number": "number", "string": "string", "object": None, "nested": None @@ -308,6 +469,7 @@ _merge_type = { "long": "string", "float": "string", "double": "string", + "number": "string", "string": "string", "object": None, "nested": None @@ -319,6 +481,7 @@ _merge_type = { "long": None, "float": None, "double": None, + "number": None, "string": None, "object": "object", "nested": "nested" @@ -330,9 +493,9 @@ _merge_type = { "long": None, "float": None, "double": None, + "number": None, "string": None, "object": "nested", "nested": "nested" } } - diff --git a/vendor/jx_python/namespace/__init__.py b/vendor/jx_python/namespace/__init__.py index 27495a3..e69de29 100644 --- a/vendor/jx_python/namespace/__init__.py +++ b/vendor/jx_python/namespace/__init__.py @@ -1,59 +0,0 @@ -# encoding: utf-8 -# -# -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. 
If a copy of the MPL was not distributed with this file, -# You can obtain one at http://mozilla.org/MPL/2.0/. -# -# Author: Kyle Lahnakoski (kyle@lahnakoski.com) -# -from __future__ import unicode_literals -from __future__ import division -from __future__ import absolute_import - -from collections import Mapping - -from mo_dots import set_default, Data -from jx_base.query import QueryOp - - -class Namespace(object): - - def convert(self, expr): - raise NotImplementedError() - - def _convert_query(self, query): - output = QueryOp("from", None) - output.select = self._convert_clause(query.select) - output.where = self.convert(query.where) - output["from"] = self._convert_from(query["from"]) - output.edges = self._convert_clause(query.edges) - output.having = convert_list(self._convert_having, query.having) - output.window = convert_list(self._convert_window, query.window) - output.sort = self._convert_clause(query.sort) - output.format = query.format - - return output - - def _convert_from(self, frum): - raise NotImplementedError() - - def _convert_clause(self, clause): - raise NotImplementedError() - - def _convert_having(self, clause): - raise NotImplementedError() - - def _convert_window(self, clause): - raise NotImplementedError() - - -def convert_list(operator, operand): - if operand==None: - return None - elif isinstance(operand, Mapping): - return operator(operand) - else: - return map(operator, operand) - - diff --git a/vendor/jx_python/table.py b/vendor/jx_python/table.py index 96f0a9e..445a3df 100644 --- a/vendor/jx_python/table.py +++ b/vendor/jx_python/table.py @@ -10,10 +10,12 @@ from __future__ import unicode_literals from __future__ import division from __future__ import absolute_import + +import jx_base from mo_dots import Data -class Table(object): +class Table(jx_base.Table): __slots__ = ['header', 'data', 'meta'] diff --git a/vendor/mo_collections/relation.py b/vendor/mo_collections/relation.py index fbd6297..b848608 100644 --- a/vendor/mo_collections/relation.py +++ b/vendor/mo_collections/relation.py @@ -12,40 +12,61 @@ from __future__ import unicode_literals from __future__ import division from __future__ import absolute_import +from mo_logs import Log + class Relation_usingList(object): def __init__(self): - self.all=set() + self.all = set() def len(self): return len(self.all) - def add(self, key, value): - test = (key, value) - if test not in self.all: - self.all.add(test) - def testAndAdd(self, key, value): """ RETURN TRUE IF THIS RELATION IS NET-NEW """ test = (key, value) - if test not in self.all: - self.all.add(test) - return True - return False + output = test not in self.all + self.all.add(test) + return output def extend(self, key, values): for v in values: - self.add(key, v) + self[key] = v def __getitem__(self, key): + """ + USE THIS IF YOU ARE CONFIDENT THIS IS A MANY-TO-ONE MAPPING + RETURN THE SINGLE CO-DOMAIN OBJECT THIS key MAPS TO + """ + output = [v for k, v in self.all if k == key] + if not output: + return None + elif len(output) == 1: + return output[0] + else: + Log.error("Not allowed") + + def __setitem__(self, key, value): + self.all.add((key, value)) + + def get_domain(self, value): + """ + RETURN domain FOR GIVEN CODOMAIN + :param value: + :return: + """ + return [k for k, v in self.all if v == value] + + def get_codomain(self, key): """ RETURN AN ARRAY OF OBJECTS THAT key MAPS TO """ return [v for k, v in self.all if k == key] + class Relation(object): def __init__(self): self.map = dict() @@ -96,5 +117,3 @@ class Relation(object): def 
domain(self): return self.map.keys() - - diff --git a/vendor/mo_dots/__init__.py b/vendor/mo_dots/__init__.py index 14913a0..156cee0 100644 --- a/vendor/mo_dots/__init__.py +++ b/vendor/mo_dots/__init__.py @@ -411,6 +411,12 @@ def lower_match(value, candidates): def wrap(v): + """ + WRAP AS Data OBJECT FOR DATA PROCESSING: https://github.com/klahnakoski/mo-dots/tree/dev/docs + :param v: THE VALUE TO WRAP + :return: Data INSTANCE + """ + type_ = _get(v, "__class__") if type_ is dict: @@ -422,7 +428,7 @@ def wrap(v): elif type_ is list: return FlatList(v) elif type_ in generator_types: - return FlatList(list(v)) + return FlatList(list(unwrap(vv) for vv in v)) else: return v diff --git a/vendor/mo_dots/lists.py b/vendor/mo_dots/lists.py index 600e7a7..9db3958 100644 --- a/vendor/mo_dots/lists.py +++ b/vendor/mo_dots/lists.py @@ -19,12 +19,20 @@ from mo_dots.nones import Null _get = object.__getattribute__ _set = object.__setattr__ _emit_slice_warning = True + _datawrap = None +Log = None + def _late_import(): global _datawrap + global Log from mo_dots.objects import datawrap as _datawrap + try: + from mo_logs import Log + except Exception: + from mo_dots.utils import PoorLogger as Log _ = _datawrap @@ -33,6 +41,7 @@ class FlatList(list): """ ENCAPSULATES HANDING OF Nulls BY wrapING ALL MEMBERS AS NEEDED ENCAPSULATES FLAT SLICES ([::]) FOR USE IN WINDOW FUNCTIONS + https://github.com/klahnakoski/mo-dots/tree/dev/docs#flatlist-is-flat """ EMPTY = None @@ -50,7 +59,8 @@ class FlatList(list): if isinstance(index, slice): # IMPLEMENT FLAT SLICES (for i not in range(0, len(self)): assert self[i]==None) if index.step is not None: - Log = _late_import() + if not Log: + _late_import() Log.error("slice step must be None, do not know how to deal with values") length = len(_get(self, "list")) @@ -78,7 +88,8 @@ class FlatList(list): _list.append(None) _list[i] = unwrap(y) except Exception as e: - Log = _late_import() + if not Log: + _late_import() Log.error("problem", cause=e) def __getattribute__(self, key): @@ -95,20 +106,22 @@ class FlatList(list): """ simple `select` """ - if not _datawrap: + if not Log: _late_import() return FlatList(vals=[unwrap(coalesce(_datawrap(v), Null)[key]) for v in _get(self, "list")]) def select(self, key): - Log = _late_import() + if not Log: + _late_import() Log.error("Not supported. Use `get()`") def filter(self, _filter): return FlatList(vals=[unwrap(u) for u in (wrap(v) for v in _get(self, "list")) if _filter(u)]) def __delslice__(self, i, j): - Log = _late_import() + if not Log: + _late_import() Log.error("Can not perform del on slice: modulo arithmetic was performed on the parameters. You can try using clear()") def __clear__(self): @@ -134,8 +147,9 @@ class FlatList(list): global _emit_slice_warning if _emit_slice_warning: - _emit_slice_warning=False - Log = _late_import() + _emit_slice_warning = False + if not Log: + _late_import() Log.warning("slicing is broken in Python 2.7: a[i:j] == a[i+len(a), j] sometimes. 
Use [start:stop:step] (see https://github.com/klahnakoski/pyLibrary/blob/master/pyLibrary/dot/README.md#the-slice-operator-in-python27-is-inconsistent)")

         return self[i:j:]

diff --git a/vendor/mo_files/__init__.py b/vendor/mo_files/__init__.py
index 4832989..ff1816a 100644
--- a/vendor/mo_files/__init__.py
+++ b/vendor/mo_files/__init__.py
@@ -408,6 +408,10 @@ class File(object):


 class TempDirectory(File):
+    """
+    A CONTEXT MANAGER FOR AN ALLOCATED, BUT UNOPENED TEMPORARY DIRECTORY
+    WILL BE DELETED WHEN EXITED
+    """
     def __new__(cls):
         return File.__new__(cls, None)

@@ -418,10 +422,14 @@ class TempDirectory(File):
         return self

     def __exit__(self, exc_type, exc_val, exc_tb):
-        Thread.run("delete "+self.name, delete_daemon, file=self)
+        Thread.run("delete dir "+self.name, delete_daemon, file=self)


 class TempFile(File):
+    """
+    A CONTEXT MANAGER FOR AN ALLOCATED, BUT UNOPENED TEMPORARY FILE
+    WILL BE DELETED WHEN EXITED
+    """
     def __new__(cls, *args, **kwargs):
         return object.__new__(cls)

@@ -434,7 +442,7 @@ class TempFile(File):
         return self

     def __exit__(self, exc_type, exc_val, exc_tb):
-        Thread.run("delete "+self.name, delete_daemon, file=self)
+        Thread.run("delete file "+self.name, delete_daemon, file=self)


 def _copy(from_, to_):
diff --git a/vendor/mo_future/__init__.py b/vendor/mo_future/__init__.py
index 699bbad..8dcd592 100644
--- a/vendor/mo_future/__init__.py
+++ b/vendor/mo_future/__init__.py
@@ -37,8 +37,15 @@ if PY3:
     unichr = chr
     xrange = range

-    filter_type = type(filter(lambda x: True, []))
-    generator_types = (collections.Iterable, filter_type)
+    def _gen():
+        yield
+
+    generator_types = (
+        type(_gen()),
+        type(filter(lambda x: True, [])),
+        type({}.items()),
+        type({}.values())
+    )

     unichr = chr
     round = round
diff --git a/vendor/mo_json_config/__init__.py b/vendor/mo_json_config/__init__.py
index 464ea89..1c6544a 100644
--- a/vendor/mo_json_config/__init__.py
+++ b/vendor/mo_json_config/__init__.py
@@ -56,6 +56,8 @@ def expand(doc, doc_url="param://", params=None):
     ASSUMING YOU ALREADY PULLED THE doc FROM doc_url, YOU CAN STILL USE THE
     EXPANDING FEATURE

+    USE mo_json_config.expand({}) TO ASSUME CURRENT WORKING DIRECTORY
+
     :param doc: THE DATA STRUCTURE FROM JSON SOURCE
     :param doc_url: THE URL THIS doc CAME FROM (DEFAULT USES params AS A DOCUMENT SOURCE)
     :param params: EXTRA PARAMETERS NOT FOUND IN THE doc_url PARAMETERS (WILL SUPERSEDE PARAMETERS FROM doc_url)
diff --git a/vendor/mo_kwargs/__init__.py b/vendor/mo_kwargs/__init__.py
index bfa4bd8..1687f1d 100644
--- a/vendor/mo_kwargs/__init__.py
+++ b/vendor/mo_kwargs/__init__.py
@@ -97,9 +97,10 @@ def override(func):
             if e.message.startswith(func_name) and "takes at least" in e:
                 missing = [p for p in params if str(p) not in packed]
                 get_logger().error(
-                    "Problem calling {{func_name}}: Expecting parameter {{missing}}",
+                    "Problem calling {{func_name}}: Expecting parameter {{missing}}, given {{given}}",
                     func_name=func_name,
                     missing=missing,
+                    given=packed.keys(),
                     stack_depth=1
                 )
         get_logger().error("Error dispatching call", e)
diff --git a/vendor/mo_logs/__init__.py b/vendor/mo_logs/__init__.py
index 3f50205..355204f 100644
--- a/vendor/mo_logs/__init__.py
+++ b/vendor/mo_logs/__init__.py
@@ -103,7 +103,13 @@ class Log(object):

     @classmethod
     def stop(cls):
-        from mo_logs import profiles
+        """
+        DECONSTRUCTS ANY LOGGING, AND RETURNS TO DIRECT-TO-stdout LOGGING
+        EXECUTING MULTIPLE TIMES IN A ROW IS SAFE; IT HAS NO NET EFFECT, IT STILL LOGS TO stdout
+        :return: NOTHING
+        """
+
+        from mo_threads import profiles

         if cls.cprofiler and 
hasattr(cls, "settings"):
            if cls.cprofiler == None:
@@ -429,7 +435,6 @@ class Log(object):

        trace = exceptions.extract_stack(stack_depth + 1)
        e = Except(exceptions.ERROR, template, params, cause, trace)
-        str_e = text_type(e)

        error_mode = cls.error_mode
        with suppress_exception:
@@ -443,7 +448,7 @@ class Log(object):
                )
            cls.error_mode = error_mode

-        sys.stderr.write(str_e.encode('utf8'))
+        sys.stderr.write(str(e))


    def write(self):
@@ -475,6 +480,10 @@ def write_profile(profile_settings, stats):
        stats_file.write(convert.list2tab(stats))


+def _same_frame(frameA, frameB):
+    return (frameA.line, frameA.file) == (frameB.line, frameB.file)
+
+
 # GET THE MACHINE METADATA
 machine_metadata = wrap({
     "pid": os.getpid(),
diff --git a/vendor/mo_logs/exceptions.py b/vendor/mo_logs/exceptions.py
index 27fe059..0a494b4 100644
--- a/vendor/mo_logs/exceptions.py
+++ b/vendor/mo_logs/exceptions.py
@@ -55,6 +55,13 @@ class Except(Exception):

     @classmethod
     def wrap(cls, e, stack_depth=0):
+        """
+        ENSURE THE STACKTRACE AND CAUSAL CHAIN IS CAPTURED, PLUS ADD FEATURES OF Except
+
+        :param e: AN EXCEPTION OF ANY TYPE
+        :param stack_depth: HOW MANY CALLS TO TAKE OFF THE TOP OF THE STACK TRACE
+        :return: AN Except OBJECT WRAPPING THE SAME ERROR
+        """
         if e == None:
             return Null
         elif isinstance(e, (list, Except)):
diff --git a/vendor/mo_logs/log_usingElasticSearch.py b/vendor/mo_logs/log_usingElasticSearch.py
index 76caa92..62cf1d3 100644
--- a/vendor/mo_logs/log_usingElasticSearch.py
+++ b/vendor/mo_logs/log_usingElasticSearch.py
@@ -37,6 +37,10 @@ class StructuredLogger_usingElasticSearch(StructuredLogger):
         """
         settings ARE FOR THE ELASTICSEARCH INDEX
         """
+        kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
+        kwargs.retry.times = coalesce(kwargs.retry.times, 3)
+        kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds
+
         self.es = Cluster(kwargs).get_or_create_index(
             schema=mo_json.json2value(value2json(SCHEMA), leaves=True),
             limit_replicas=True,
@@ -46,8 +50,7 @@ class StructuredLogger_usingElasticSearch(StructuredLogger):
         self.batch_size = batch_size
         self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
         self.queue = Queue("debug logs to es", max=max_size, silent=True)
-        self.es.settings.retry.times = coalesce(self.es.settings.retry.times, 3)
-        self.es.settings.retry.sleep = Duration(coalesce(self.es.settings.retry.sleep, MINUTE))
+
         Thread.run("add debug logs to es", self._insert_loop)

     def write(self, template, params):
diff --git a/vendor/mo_logs/log_usingStream.py b/vendor/mo_logs/log_usingStream.py
index ff8f934..a34eb64 100644
--- a/vendor/mo_logs/log_usingStream.py
+++ b/vendor/mo_logs/log_usingStream.py
@@ -22,8 +22,9 @@ from mo_logs.strings import expand_template

 class StructuredLogger_usingStream(StructuredLogger):
     def __init__(self, stream):
-        self.locker = allocate_lock()
         try:
+            self.locker = allocate_lock()
+            self.flush = stream.flush
             if stream in (sys.stdout, sys.stderr):
                 if PY3:
                     self.writer = stream.write
@@ -33,8 +34,8 @@ class StructuredLogger_usingStream(StructuredLogger):
                 self.writer = _UTF8Encoder(stream).write
             else:
                 self.writer = stream.write
-        except Exception as e:
-            sys.stderr("can not handle")
+        except Exception as _:
+            sys.stderr.write("can not handle")

     def write(self, template, params):
         value = expand_template(template, params)
@@ -45,7 +46,7 @@ class StructuredLogger_usingStream(StructuredLogger):
             self.locker.release()

     def stop(self):
-        pass
+        self.flush()


 class _UTF8Encoder(object):
@@ -56,5 +57,5 @@ class _UTF8Encoder(object):
     def 
write(self, v):
         try:
             self.stream.write(v.encode('utf8'))
-        except Exception as e:
-            sys.stderr("can not handle")
+        except Exception as _:
+            sys.stderr.write("can not handle")
diff --git a/vendor/mo_logs/log_usingThreadedStream.py b/vendor/mo_logs/log_usingThreadedStream.py
index ba20b48..8e1f47a 100644
--- a/vendor/mo_logs/log_usingThreadedStream.py
+++ b/vendor/mo_logs/log_usingThreadedStream.py
@@ -93,7 +93,9 @@ def time_delta_pusher(please_stop, appender, queue, interval):

     next_run = time() + interval
     while not please_stop:
+        Thread.current().cprofiler.disable()
         (Till(till=next_run) | please_stop).wait()
+        Thread.current().cprofiler.enable()
         next_run = time() + interval
         logs = queue.pop_all()
         if not logs:
@@ -116,7 +118,7 @@ def time_delta_pusher(please_stop, appender, queue, interval):

                 appender(u"\n".join(lines) + u"\n")
         except Exception as e:
-            sys.stderr.write(b"Trouble with appender: " + str(e.__class__.__name__) + b"\n")
+            sys.stderr.write(str("Trouble with appender: ") + str(e.__class__.__name__) + str("\n"))
             # SWALLOW ERROR, MUST KEEP RUNNING
diff --git a/vendor/mo_logs/startup.py b/vendor/mo_logs/startup.py
index 6b465fd..8862018 100644
--- a/vendor/mo_logs/startup.py
+++ b/vendor/mo_logs/startup.py
@@ -20,7 +20,7 @@ import tempfile
 import mo_json_config
 from mo_files import File
 from mo_logs import Log
-from mo_dots import listwrap, wrap, unwrap
+from mo_dots import listwrap, wrap, unwrap, coalesce


 # PARAMETERS MATCH argparse.ArgumentParser.add_argument()
@@ -58,41 +58,34 @@ def argparse(defs):
     return wrap(output)


-def read_settings(filename=None, defs=None, env_filename=None):
+def read_settings(filename=None, defs=None):
     """
     :param filename: Force load a file
     :param defs: arguments you want to accept
-    :param env_filename: A config file from an environment variable (a fallback config file, if no other provided)
     :return:
     """
     # READ SETTINGS
-    if filename:
-        settings_file = File(filename)
-        if not settings_file.exists:
-            Log.error("Can not file settings file {{filename}}", {
-                "filename": settings_file.abspath
-            })
-        settings = mo_json_config.get("file:///" + settings_file.abspath)
-        if defs:
-            settings.args = argparse(defs)
-        return settings
-    else:
-        defs = listwrap(defs)
-        defs.append({
-            "name": ["--config", "--settings", "--settings-file", "--settings_file"],
-            "help": "path to JSON file with settings",
-            "type": str,
-            "dest": "filename",
-            "default": "config.json",
-            "required": False
-        })
-        args = argparse(defs)
+    defs = listwrap(defs)
+    defs.append({
+        "name": ["--config", "--settings", "--settings-file", "--settings_file"],
+        "help": "path to JSON file with settings",
+        "type": str,
+        "dest": "filename",
+        "default": None,
+        "required": False
+    })
+    args = argparse(defs)

-        if env_filename:
-            args.filename = env_filename
-        settings = mo_json_config.get("file://" + args.filename.replace(os.sep, "/"))
-        settings.args = args
-        return settings
+    args.filename = coalesce(filename, args.filename, "./config.json")
+    settings_file = File(args.filename)
+    if not settings_file.exists:
+        Log.error("Can not read configuration file {{filename}}", {
+            "filename": settings_file.abspath
+        })
+    settings = mo_json_config.get("file:///" + settings_file.abspath)
+    settings.args = args
+    return settings


 # snagged from https://github.com/pycontribs/tendo/blob/master/tendo/singleton.py (under licence PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2)
diff --git 
a/vendor/mo_math/__init__.py b/vendor/mo_math/__init__.py
index 82277c9..300a94d 100644
--- a/vendor/mo_math/__init__.py
+++ b/vendor/mo_math/__init__.py
@@ -297,6 +297,12 @@ def MIN(values, *others):


 def MAX(values, *others):
+    """
+    DECISIVE MAX
+    :param values:
+    :param others:
+    :return:
+    """
     if others:
         from mo_logs import Log

diff --git a/vendor/mo_testing/fuzzytestcase.py b/vendor/mo_testing/fuzzytestcase.py
index 09d8207..4d20a87 100644
--- a/vendor/mo_testing/fuzzytestcase.py
+++ b/vendor/mo_testing/fuzzytestcase.py
@@ -86,6 +86,8 @@ def assertAlmostEqual(test, expected, digits=None, places=None, msg=None, delta=
         return
     elif test is expected:
         return
+    elif isinstance(expected, text_type):
+        assertAlmostEqualValue(test, expected, msg=msg, digits=digits, places=places, delta=delta)
     elif isinstance(test, UniqueIndex):
         if test ^ expected:
             Log.error("Sets do not match")
diff --git a/vendor/mo_threads/__init__.py b/vendor/mo_threads/__init__.py
index 13bbb3f..c798d7d 100644
--- a/vendor/mo_threads/__init__.py
+++ b/vendor/mo_threads/__init__.py
@@ -15,15 +15,21 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import unicode_literals

-from mo_future import get_function_name
-
+from mo_logs import Log
 from mo_threads.lock import Lock
-from mo_threads.signal import Signal
-from mo_threads.till import Till
-from mo_threads.threads import Thread, THREAD_STOP, THREAD_TIMEOUT
-from mo_threads.queues import Queue
-from mo_threads.queues import ThreadedQueue
 from mo_threads.multiprocess import Process
+from mo_threads.queues import Queue, ThreadedQueue
+from mo_threads.signal import Signal
+from mo_threads.threads import Thread, THREAD_STOP, THREAD_TIMEOUT, MainThread, stop_main_thread, MAIN_THREAD
+from mo_threads import till
+from mo_threads.till import Till
+
+Log.cprofiler_stats = Queue("cprofiler stats")  # ACCUMULATION OF STATS FROM ALL THREADS
+
+MAIN_THREAD.timers = Thread.run("timers daemon", till.daemon)
+MAIN_THREAD.children.remove(MAIN_THREAD.timers)
+
+
+
 # from threading import Thread as _threading_Thread
@@ -78,3 +84,4 @@ from mo_threads.multiprocess import Process
 # _threading_Thread.setDaemon = _setDaemon
 #
 #
+
diff --git a/vendor/mo_threads/lock.py b/vendor/mo_threads/lock.py
index 42c79be..89b1768 100644
--- a/vendor/mo_threads/lock.py
+++ b/vendor/mo_threads/lock.py
@@ -49,7 +49,7 @@ def _late_import():

 class Lock(object):
     """
-    A NON-RE-ENTRANT LOCK WITH wait() AND
+    A NON-RE-ENTRANT LOCK WITH wait()
     """
     __slots__ = ["name", "lock", "waiting"]

@@ -77,7 +77,7 @@ class Lock(object):
         """
         THE ASSUMPTION IS wait() WILL ALWAYS RETURN WITH THE LOCK ACQUIRED
         :param till: WHEN TO GIVE UP WAITING FOR ANOTHER THREAD TO SIGNAL
-        :return: True IF SIGNALED TO GO, False IF TIMEOUT HAPPENED
+        :return: True IF SIGNALED TO GO, False IF till WAS SIGNALED
         """
         waiter = Signal()
         if self.waiting:
diff --git a/vendor/mo_logs/profiles.py b/vendor/mo_threads/profiles.py
similarity index 83%
rename from vendor/mo_logs/profiles.py
rename to vendor/mo_threads/profiles.py
index d3dcf1f..3013382 100644
--- a/vendor/mo_logs/profiles.py
+++ b/vendor/mo_threads/profiles.py
@@ -16,26 +16,12 @@ import pstats
 from datetime import datetime
 from time import clock 
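The revised Lock.wait(till=...) contract above (True when another thread signals, False when `till` fires first) is easiest to see in a sketch. This is illustrative only; `todo`, producer() and consumer() are hypothetical names, not part of this patch:

    from mo_threads import Lock, Till

    lock = Lock("demo lock")
    todo = []  # hypothetical shared state, guarded by `lock`

    def consumer(please_stop):
        with lock:
            while not todo:
                if not lock.wait(till=Till(seconds=5)):
                    return  # False: the `till` timeout fired; no thread signaled
            todo.pop()

    def producer(please_stop):
        with lock:
            todo.append("work")  # leaving the `with` block wakes one waiter
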
-from mo_dots import Data -from mo_dots import wrap - +from mo_dots import Data, wrap, Null +from mo_logs import Log ON = False profiles = {} -_Log = None - - -def _late_import(): - global _Log - - from mo_logs import Log as _Log - from mo_threads import Queue - - if _Log.cprofiler_stats == None: - _Log.cprofiler_stats = Queue("cprofiler stats") # ACCUMULATION OF STATS FROM ALL THREADS - - class Profiler(object): """ VERY SIMPLE PROFILER FOR USE IN with STATEMENTS @@ -48,13 +34,12 @@ class Profiler(object): output = profiles.get(args[0]) if output: return output - output = object.__new__(cls, *args) + output = object.__new__(cls) return output def __init__(self, description): - from jx_python.windows import Stats - if ON and not hasattr(self, "description"): + from jx_python.windows import Stats self.description = description self.samples = [] self.stats = Stats()() @@ -127,20 +112,25 @@ class CProfiler(object): """ def __init__(self): - if not _Log: - _late_import() self.cprofiler = None def __enter__(self): - if _Log.cprofiler: - _Log.note("starting cprofile") + if Log.cprofiler: + Log.note("starting cprofile") self.cprofiler = cProfile.Profile() self.cprofiler.enable() def __exit__(self, exc_type, exc_val, exc_tb): - if self.cprofiler: + if self.cprofiler is not None: self.cprofiler.disable() - _Log.cprofiler_stats.add(pstats.Stats(self.cprofiler)) + Log.cprofiler_stats.add(pstats.Stats(self.cprofiler)) del self.cprofiler - _Log.note("done cprofile") + Log.note("done cprofile") + def enable(self): + if self.cprofiler is not None: + return self.cprofiler.enable() + + def disable(self): + if self.cprofiler is not None: + return self.cprofiler.disable() diff --git a/vendor/mo_threads/queues.py b/vendor/mo_threads/queues.py index 34e5310..cf77877 100644 --- a/vendor/mo_threads/queues.py +++ b/vendor/mo_threads/queues.py @@ -22,14 +22,12 @@ from datetime import datetime from time import time from mo_dots import coalesce, Null -from mo_threads import Lock, Signal, Thread, THREAD_STOP, THREAD_TIMEOUT, Till - from mo_logs import Log +from mo_threads.lock import Lock +from mo_threads.signal import Signal +from mo_threads.threads import THREAD_STOP, THREAD_TIMEOUT, Thread +from mo_threads.till import Till -_convert = None -_Except = None -_CProfiler = None -_Log = None DEBUG = False # MAX_DATETIME = datetime(2286, 11, 20, 17, 46, 39) @@ -37,23 +35,6 @@ DEFAULT_WAIT_TIME = 10 * 60 # SECONDS datetime.strptime('2012-01-01', '%Y-%m-%d') # http://bugs.python.org/issue7980 - -def _late_import(): - global _convert - global _Except - global _CProfiler - global _Log - - from mo_logs.exceptions import Except as _Except - from mo_logs.profiles import CProfiler as _CProfiler - from mo_logs import Log as _Log - - _ = _convert - _ = _Except - _ = _CProfiler - _ = _Log - - class Queue(object): """ SIMPLE MESSAGE QUEUE, multiprocessing.Queue REQUIRES SERIALIZATION, WHICH @@ -66,9 +47,6 @@ class Queue(object): silent - COMPLAIN IF THE READERS ARE TOO SLOW unique - SET True IF YOU WANT ONLY ONE INSTANCE IN THE QUEUE AT A TIME """ - if not _Log: - _late_import() - self.name = name self.max = coalesce(max, 2 ** 10) self.silent = silent @@ -88,10 +66,10 @@ class Queue(object): if value is not None: yield value except Exception as e: - _Log.warning("Tell me about what happened here", e) + Log.warning("Tell me about what happened here", e) if not self.silent: - _Log.note("queue iterator is done") + Log.note("queue iterator is done") def add(self, value, timeout=None): with self.lock: @@ -103,7 +81,7 @@ class 
Queue(object):
            self._wait_for_queue_space(timeout=timeout)
            if self.please_stop and not self.allow_add_after_close:
-                _Log.error("Do not add to closed queue")
+                Log.error("Do not add to closed queue")
            else:
                if self.unique:
                    if value not in self.queue:
@@ -117,7 +95,7 @@ class Queue(object):
        SNEAK value TO FRONT OF THE QUEUE
        """
        if self.please_stop and not self.allow_add_after_close:
-            _Log.error("Do not push to closed queue")
+            Log.error("Do not push to closed queue")

        with self.lock:
            self._wait_for_queue_space()
@@ -132,12 +110,12 @@ class Queue(object):
        """
        if till is not None and not isinstance(till, Signal):
-            _Log.error("Expecting a signal")
+            Log.error("Expecting a signal")
        return Null, self.pop(till=till)

    def extend(self, values):
        if self.please_stop and not self.allow_add_after_close:
-            _Log.error("Do not push to closed queue")
+            Log.error("Do not push to closed queue")

        with self.lock:
            # ONCE THE queue IS BELOW LIMIT, ALLOW ADDING MORE
@@ -171,16 +149,16 @@ class Queue(object):
        if timeout != None:
            time_to_stop_waiting = now + timeout
        else:
-            time_to_stop_waiting = Null
+            time_to_stop_waiting = None

        if self.next_warning < now:
            self.next_warning = now + wait_time

        while not self.please_stop and len(self.queue) >= self.max:
            if now > time_to_stop_waiting:
-                if not _Log:
-                    _late_import()
-                _Log.error(THREAD_TIMEOUT)
+                Log.error(THREAD_TIMEOUT)

            if self.silent:
                self.lock.wait(Till(till=time_to_stop_waiting))
@@ -190,7 +168,7 @@ class Queue(object):
                now = time()
                if self.next_warning < now:
                    self.next_warning = now + wait_time
-                    _Log.alert(
+                    Log.alert(
                        "Queue by name of {{name|quote}} is full with ({{num}} items), thread(s) have been waiting {{wait_time}} sec",
                        name=self.name,
                        num=len(self.queue),
@@ -215,7 +193,7 @@ class Queue(object):
        :return: A value, or a THREAD_STOP or None
        """
        if till is not None and not isinstance(till, Signal):
-            _Log.error("expecting a signal")
+            Log.error("expecting a signal")

        with self.lock:
            while True:
@@ -229,7 +207,7 @@ class Queue(object):
                    break
                return None
        if DEBUG or not self.silent:
-            _Log.note(self.name + " queue stopped")
+            Log.note(self.name + " queue stopped")
        return THREAD_STOP

    def pop_all(self):
@@ -289,13 +267,13 @@ class ThreadedQueue(Queue):
        # BE CAREFUL! THE THREAD MAKING THE CALL WILL NOT BE YOUR OWN!
        # DEFAULT BEHAVIOUR: THIS WILL KEEP RETRYING WITH WARNINGS
    ):
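A sketch of the Queue contract as revised above (illustrative only; the queue name and timeout are assumptions, not part of this patch):

    from mo_threads import Queue, THREAD_STOP, Till

    q = Queue("demo queue", max=10)
    q.add("item")
    value = q.pop(till=Till(seconds=1))
    # pop() returns a value, None when `till` fires first,
    # or THREAD_STOP once the queue is closed
    if value is THREAD_STOP:
        pass  # queue was closed; stop consuming
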
-        if not _Log:
-            _late_import()

        if period !=None and not isinstance(period, (int, float, long)):
-            if not _Log:
-                _late_import()
-            _Log.error("Expecting a float for the period")
+            Log.error("Expecting a float for the period")

        batch_size = coalesce(batch_size, int(max_size / 2) if max_size else None, 900)
        max_size = coalesce(max_size, batch_size * 2)  # REASONABLE DEFAULT
@@ -328,7 +306,7 @@ class ThreadedQueue(Queue):
                        item = self.pop()
                        now = time()
                        if now > last_push + period:
-                            # _Log.note("delay next push")
+                            # Log.note("delay next push")
                            next_push = Till(till=now + period)
                    else:
                        item = self.pop(till=next_push)
@@ -349,13 +327,13 @@ class ThreadedQueue(Queue):
                            try:
                                error_target(e, _buffer)
                            except Exception as f:
-                                _Log.warning(
+                                Log.warning(
                                    "`error_target` should not throw, just deal",
                                    name=name,
                                    cause=f
                                )
                        else:
-                            _Log.warning(
+                            Log.warning(
                                "Unexpected problem",
                                name=name,
                                cause=e
@@ -374,13 +352,13 @@ class ThreadedQueue(Queue):
                            try:
                                error_target(e, _buffer)
                            except Exception as f:
-                                _Log.warning(
+                                Log.warning(
                                    "`error_target` should not throw, just deal",
                                    name=name,
                                    cause=f
                                )
                        else:
-                            _Log.warning(
+                            Log.warning(
                                "Problem with {{name}} pushing {{num}} items to data sink",
                                name=name,
                                num=len(_buffer),
@@ -405,8 +383,8 @@ class ThreadedQueue(Queue):
        # from jx_python import jx
        #
        # biggest = jx.sort(sizes, "size").last().id
-        # _Log.note("Big record {{id}}", id=biggest)
-        # _Log.note("{{name}} has {{num}} items with json size of {{size|comma}}", name=self.name, num=len(self.queue), size=size)
+        # Log.note("Big record {{id}}", id=biggest)
+        # Log.note("{{name}} has {{num}} items with json size of {{size|comma}}", name=self.name, num=len(self.queue), size=size)
        return self

    def extend(self, values):
@@ -415,7 +393,7 @@ class ThreadedQueue(Queue):
            self._wait_for_queue_space()
            if not self.please_stop:
                self.queue.extend(values)
-                _Log.note("{{name}} has {{num}} items", name=self.name, num=len(self.queue))
+                Log.note("{{name}} has {{num}} items", name=self.name, num=len(self.queue))
        return self

    def __enter__(self):
@@ -430,3 +408,5 @@ class ThreadedQueue(Queue):
    def stop(self):
        self.add(THREAD_STOP)
        self.thread.join()
+
+
diff --git a/vendor/mo_threads/threads.py b/vendor/mo_threads/threads.py
index 798c2ef..85f2204 100644
--- a/vendor/mo_threads/threads.py
+++ b/vendor/mo_threads/threads.py
@@ -15,20 +15,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import unicode_literals

+import signal as _signal
 import sys
-
 from copy import copy
 from datetime import datetime, timedelta
 from time import sleep

-from mo_future import get_ident, start_new_thread, interrupt_main
-
 from mo_dots import Data, unwraplist, Null
+from mo_future import get_ident, start_new_thread, interrupt_main, get_function_name, text_type
 from mo_logs import Log, Except
-from mo_logs.profiles import CProfiler
-from mo_threads import Till, Lock, Signal, till
-
-from mo_threads.signal import AndSignals
+from mo_threads.lock import Lock
+from mo_threads.profiles import CProfiler
+from mo_threads.signal import AndSignals, Signal
+from mo_threads.till import Till

 DEBUG = False

@@ -81,8 +80,11 @@ class MainThread(object):
     def __init__(self):
         self.name = "Main Thread"
         self.id = get_ident()
+        self.please_stop = Signal()
         self.children = []
+        self.stop_logging = Log.stop
         self.timers = None
+        self.cprofiler = Null

     def add_child(self, child):
         self.children.append(child)

@@ -96,9 +98,15 @@ class 
MainThread(object):

    def stop(self):
        """
        BLOCKS UNTIL ALL THREADS HAVE STOPPED
+        THEN RUNS sys.exit(0)
        """
-        join_errors = []
+        self.please_stop.go()

+        self_thread = Thread.current()
+        if self_thread != MAIN_THREAD or self_thread != self:
+            Log.error("Only the main thread can call stop() on main thread")
+
+        join_errors = []
        children = copy(self.children)
        for c in reversed(children):
            if DEBUG and c.name:
@@ -122,11 +130,57 @@ class MainThread(object):
        if join_errors:
            Log.error("Problem while stopping {{name|quote}}", name=self.name, cause=unwraplist(join_errors))

+        self.stop_logging()
        self.timers.stop()
        self.timers.join()

        if DEBUG:
            Log.note("Thread {{name|quote}} now stopped", name=self.name)
+        sys.exit(0)
+
+    def wait_for_shutdown_signal(
+        self,
+        please_stop=False,  # ASSIGN SIGNAL TO STOP EARLY
+        allow_exit=False,  # ALLOW "exit" COMMAND ON CONSOLE TO ALSO STOP THE APP
+        wait_forever=True  # IGNORE CHILD THREADS, NEVER EXIT.  False => IF NO CHILD THREADS LEFT, THEN EXIT
+    ):
+        """
+        FOR USE BY PROCESSES THAT NEVER DIE UNLESS EXTERNAL SHUTDOWN IS REQUESTED
+
+        CALLING THREAD WILL SLEEP UNTIL keyboard interrupt, OR please_stop, OR "exit"
+
+        :param please_stop:
+        :param allow_exit:
+        :param wait_forever: Assume all needed threads have been launched; when False, exit once all child threads are done
+        :return:
+        """
+        self_thread = Thread.current()
+        if self_thread != MAIN_THREAD or self_thread != self:
+            Log.error("Only the main thread can sleep forever (waiting for KeyboardInterrupt)")
+
+        if isinstance(please_stop, Signal):
+            self.please_stop.on_go(please_stop.go)
+        else:
+            please_stop = self.please_stop
+
+        if not wait_forever:
+            # TRIGGER SIGNAL WHEN ALL CHILD THREADS ARE DONE
+            pending = copy(self_thread.children)
+            all = AndSignals(please_stop, len(pending))
+            for p in pending:
+                p.stopped.on_go(all.done)
+
+        try:
+            if allow_exit:
+                _wait_for_exit(please_stop)
+            else:
+                _wait_for_interrupt(please_stop)
+        except KeyboardInterrupt as _:
+            Log.alert("SIGINT Detected! Stopping...")
+        except SystemExit as _:
+            Log.alert("SIGTERM Detected! 
Stopping...") + finally: + self.stop() class Thread(object): @@ -152,7 +206,7 @@ class Thread(object): self.thread = None self.stopped = Signal("stopped signal for " + self.name) - self.cprofiler = None + self.cprofiler = Null self.children = [] if "parent_thread" in kwargs: @@ -162,7 +216,6 @@ class Thread(object): self.parent = Thread.current() self.parent.add_child(self) - def __enter__(self): return self @@ -210,7 +263,8 @@ class Thread(object): try: if self.target is not None: a, k, self.args, self.kwargs = self.args, self.kwargs, None, None - with CProfiler(): # PROFILE IN HERE SO THAT __exit__() IS RUN BEFORE THREAD MARKED AS stopped + self.cprofiler = CProfiler() + with self.cprofiler: # PROFILE IN HERE SO THAT __exit__() IS RUN BEFORE THREAD MARKED AS stopped response = self.target(*a, **k) with self.synch_lock: self.end_of_thread = Data(response=response) @@ -226,7 +280,7 @@ class Thread(object): try: Log.fatal("Problem in thread {{name|quote}}", name=self.name, cause=e) except Exception: - sys.stderr.write(b"ERROR in thread: " + str(self.name) + b" " + str(e) + b"\n") + sys.stderr.write(str("ERROR in thread: " + self.name + " " + text_type(e) + "\n")) finally: try: children = copy(self.children) @@ -260,9 +314,9 @@ class Thread(object): if DEBUG: Log.warning("problem with thread {{name|quote}}", cause=e, name=self.name) finally: - self.stopped.go() if DEBUG: Log.note("thread {{name|quote}} is done", name=self.name) + self.stopped.go() def is_alive(self): return not self.stopped @@ -293,7 +347,9 @@ class Thread(object): @staticmethod def run(name, target, *args, **kwargs): # ENSURE target HAS please_stop ARGUMENT - if "please_stop" not in target.__code__.co_varnames: + if get_function_name(target) == 'wrapper': + pass # GIVE THE override DECORATOR A PASS + elif "please_stop" not in target.__code__.co_varnames: Log.error("function must have please_stop argument for signalling emergency shutdown") Thread.num_threads += 1 @@ -302,48 +358,6 @@ class Thread(object): output.start() return output - @staticmethod - def wait_for_shutdown_signal( - please_stop=False, # ASSIGN SIGNAL TO STOP EARLY - allow_exit=False, # ALLOW "exit" COMMAND ON CONSOLE TO ALSO STOP THE APP - wait_forever=True # IGNORE CHILD THREADS, NEVER EXIT. False -> IF NO CHILD THREADS LEFT, THEN EXIT - ): - """ - FOR USE BY PROCESSES NOT EXPECTED TO EVER COMPLETE UNTIL EXTERNAL - SHUTDOWN IS REQUESTED - - SLEEP UNTIL keyboard interrupt, OR please_stop, OR "exit" - - :param please_stop: - :param allow_exit: - :param wait_forever:: Assume all needed threads have been launched. When done - :return: - """ - if not isinstance(please_stop, Signal): - please_stop = Signal() - - please_stop.on_go(lambda: start_new_thread(_stop_main_thread, ())) - - self_thread = Thread.current() - if self_thread != MAIN_THREAD: - Log.error("Only the main thread can sleep forever (waiting for KeyboardInterrupt)") - - if not wait_forever: - # TRIGGER SIGNAL WHEN ALL EXITING THREADS ARE DONE - pending = copy(self_thread.children) - all = AndSignals(please_stop, len(pending)) - for p in pending: - p.stopped.on_go(all.done) - - try: - if allow_exit: - _wait_for_exit(please_stop) - else: - _wait_for_interrupt(please_stop) - except (KeyboardInterrupt, SystemExit) as _: - Log.alert("SIGINT Detected! 
Stopping...") - finally: - please_stop.go() @staticmethod def current(): @@ -355,15 +369,26 @@ class Thread(object): return MAIN_THREAD -def _stop_main_thread(): +def stop_main_thread(*args): + global DEBUG + + DEBUG = True try: + if len(args): + Log.warning("exit with {{value}}", value=_describe_exit_codes.get(args[0], args[0])) + except Exception as _: + pass + finally: MAIN_THREAD.stop() - except Exception as e: - e = Except.wrap(e) - Log.warning("Problem with threads", cause=e) - sys.exit(0) +_describe_exit_codes = { + _signal.SIGTERM: "SIGTERM", + _signal.SIGINT: "SIGINT" +} + +_signal.signal(_signal.SIGTERM, stop_main_thread) +_signal.signal(_signal.SIGINT, stop_main_thread) def _wait_for_exit(please_stop): @@ -416,11 +441,10 @@ def _interrupt_main_safely(): # WE COULD BE INTERRUPTING SELF pass + MAIN_THREAD = MainThread() ALL_LOCK = Lock("threads ALL_LOCK") ALL = dict() ALL[get_ident()] = MAIN_THREAD -MAIN_THREAD.timers = Thread.run("timers daemon", till.daemon) -MAIN_THREAD.children.remove(MAIN_THREAD.timers) diff --git a/vendor/mo_threads/till.py b/vendor/mo_threads/till.py index 79d11c7..4d543f6 100644 --- a/vendor/mo_threads/till.py +++ b/vendor/mo_threads/till.py @@ -15,9 +15,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals -from mo_future import allocate_lock as _allocate_lock from time import sleep, time +from weakref import ref +from mo_future import allocate_lock as _allocate_lock from mo_future import text_type from mo_threads.signal import Signal @@ -40,7 +41,7 @@ class Till(Signal): if not Till.enabled: return Till.done elif till == None and timeout == None and seconds == None: - return Till.done + return None else: return object.__new__(cls) @@ -70,7 +71,7 @@ class Till(Signal): with Till.locker: if timeout != None: Till.next_ping = min(Till.next_ping, timeout) - Till.new_timers.append((timeout, self)) + Till.new_timers.append((timeout, ref(self))) Till.done.go() @@ -108,13 +109,17 @@ def daemon(please_stop): new_timers, Till.new_timers = Till.new_timers, [] if DEBUG and new_timers: - Log.note("new timers: {{timers}}", timers=[t for t, s in new_timers]) + if len(new_timers) > 5: + Log.note("{{num}} new timers", num=len(new_timers)) + else: + Log.note("new timers: {{timers}}", timers=[t for t, _ in new_timers]) sorted_timers.extend(new_timers) if sorted_timers: - sorted_timers.sort(key=lambda r: r[0]) - for i, (t, s) in enumerate(sorted_timers): + sorted_timers.sort(key=actual_time) + for i, rec in enumerate(sorted_timers): + t = actual_time(rec) if now < t: work, sorted_timers = sorted_timers[:i], sorted_timers[i:] Till.next_ping = min(Till.next_ping, sorted_timers[0][0]) @@ -126,15 +131,17 @@ def daemon(please_stop): if DEBUG: Log.note( "done: {{timers}}. 
Remaining {{pending}}", - timers=[t for t, s in work], - pending=[t for t, s in sorted_timers] + timers=[t for t, s in work] if len(work) <= 5 else len(work), + pending=[t for t, s in sorted_timers] if len(sorted_timers) <= 5 else len(sorted_timers) ) - for t, s in work: - s.go() + for t, r in work: + s = r() + if s is not None: + s.go() except Exception as e: - Log.warning("timer shutdown", cause=e) + Log.warning("unexpected timer shutdown", cause=e) finally: if DEBUG: Log.alert("TIMER SHUTDOWN") @@ -145,4 +152,5 @@ def daemon(please_stop): for t, s in new_work + sorted_timers: s.go() - +def actual_time(rec): + return 0 if rec[1]() is None else rec[0] diff --git a/vendor/mo_times/dates.py b/vendor/mo_times/dates.py index 0f45e1f..ecc1ad8 100644 --- a/vendor/mo_times/dates.py +++ b/vendor/mo_times/dates.py @@ -71,7 +71,7 @@ class Date(object): def format(self, format="%Y-%m-%d %H:%M:%S"): try: - return unix2datetime(self.unix).strftime(format) + return text_type(unix2datetime(self.unix).strftime(format)) except Exception as e: from mo_logs import Log @@ -160,11 +160,15 @@ class Date(object): return self.add(-other) def __lt__(self, other): - other = Date(other) + try: + other = Date(other) + except Exception: + return False + return self.unix < other.unix def __eq__(self, other): - if other == None: + if other == None or other == '': return Null try: @@ -397,7 +401,7 @@ def unicode2Date(value, format=None): else: from mo_logs import Log - Log.error("Can not interpret {{value}} as a datetime", value= value) + Log.error("Can not interpret {{value}} as a datetime", value=value) DATETIME_EPOCH = datetime(1970, 1, 1) diff --git a/vendor/mo_times/timer.py b/vendor/mo_times/timer.py index aeebdfe..6c5acf2 100644 --- a/vendor/mo_times/timer.py +++ b/vendor/mo_times/timer.py @@ -37,6 +37,7 @@ class Timer(object): self.param = wrap(coalesce(param, {})) self.debug = debug self.silent = silent + self.agg = 0 self.start = 0 self.end = 0 self.interval = None @@ -51,6 +52,7 @@ class Timer(object): def __exit__(self, type, value, traceback): self.end = time() self.interval = self.end - self.start + self.agg += self.interval if self.debug: param = wrap(self.param) @@ -60,7 +62,15 @@ class Timer(object): @property def duration(self): + end = time() if not self.end: - return Duration(time() - self.start) + return Duration(end - self.start) return Duration(self.interval) + + @property + def total(self): + if not self.end: + Log.error("please ask for total time outside the context of measuring") + + return Duration(self.agg) diff --git a/vendor/pyLibrary/aws/s3.py b/vendor/pyLibrary/aws/s3.py index 8ed15b4..80f372c 100644 --- a/vendor/pyLibrary/aws/s3.py +++ b/vendor/pyLibrary/aws/s3.py @@ -11,17 +11,16 @@ from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals -import StringIO import gzip import zipfile from tempfile import TemporaryFile import boto -from BeautifulSoup import BeautifulSoup from boto.s3.connection import Location -from mo_future import text_type +from bs4 import BeautifulSoup from mo_dots import wrap, Null, coalesce, unwrap, Data +from mo_future import text_type, StringIO from mo_kwargs import override from mo_logs import Log, Except from mo_logs.strings import utf82unicode, unicode2utf8 @@ -472,7 +471,7 @@ def strip_extension(key): def _unzip(compressed): - buff = StringIO.StringIO(compressed) + buff = StringIO(compressed) archive = zipfile.ZipFile(buff, mode='r') return archive.read(archive.namelist()[0]) diff --git 
a/vendor/pyLibrary/convert.py b/vendor/pyLibrary/convert.py index 135a628..aa70482 100644 --- a/vendor/pyLibrary/convert.py +++ b/vendor/pyLibrary/convert.py @@ -46,6 +46,15 @@ def string2datetime(value, format=None): return unix2datetime(Date(value, format).unix) +def string2boolean(value): + if value in ["true", "T"]: + return True + elif value in ["false", "F"]: + return False + else: + return None + + def str2datetime(value, format=None): return unix2datetime(Date(value, format).unix) diff --git a/vendor/pyLibrary/env/elasticsearch.py b/vendor/pyLibrary/env/elasticsearch.py index da0c365..724469d 100644 --- a/vendor/pyLibrary/env/elasticsearch.py +++ b/vendor/pyLibrary/env/elasticsearch.py @@ -29,7 +29,7 @@ from mo_logs.strings import utf82unicode, unicode2utf8 from mo_math import Math from mo_math.randoms import Random from mo_threads import Lock, ThreadedQueue, Till -from mo_times import Date, Timer +from mo_times import Date, Timer, MINUTE from pyLibrary import convert from pyLibrary.env import http @@ -38,6 +38,8 @@ ES_NUMERIC_TYPES = ["long", "integer", "double", "float"] ES_PRIMITIVE_TYPES = ["string", "boolean", "integer", "date", "long", "double"] INDEX_DATE_FORMAT = "%Y%m%d_%H%M%S" +STALE_METADATA = 10 * MINUTE + DATA_KEY = text_type("data") @@ -85,7 +87,7 @@ class Index(Features): self.cluster = cluster or Cluster(kwargs) try: - full_index = self.get_index(index) + full_index = self.cluster.get_canonical_index(index) if full_index and alias==None: kwargs.alias = kwargs.index kwargs.index = full_index @@ -93,41 +95,40 @@ class Index(Features): Log.error("not allowed") if type == None: # NO type PROVIDED, MAYBE THERE IS A SUITABLE DEFAULT? - index_ = self.cluster.get_metadata().indices[self.settings.index] - if not index_: - Log.error("can not find index {{index}}", index=self.settings.index) + about = self.cluster.get_metadata().indices[self.settings.index] + type = self.settings.type = _get_best_type_from_mapping(about.mappings)[0] + if type == "_default_": + Log.error("not allowed") + if not type: + Log.error("not allowed") - candidate_types = list(index_.mappings.keys()) - if len(candidate_types) != 1: - Log.error("Expecting `type` parameter") - self.settings.type = type = candidate_types[0] + self.path = "/" + full_index + "/" + type except Exception as e: # EXPLORING (get_metadata()) IS NOT ALLOWED ON THE PUBLIC CLUSTER Log.error("not expected", cause=e) - if not type: - Log.error("not allowed") - - self.path = "/" + full_index + "/" + type - if self.debug: Log.alert("elasticsearch debugging for {{url}} is on", url=self.url) - if tjson: - from pyLibrary.env.typed_inserter import TypedInserter - - self.encode = TypedInserter(self, id_column).typed_encode + props = self.get_properties() + if not props: + tjson = coalesce(kwargs.tjson, True) # TYPED JSON IS DEFAULT + elif props[EXISTS_TYPE]: + if tjson is False: + Log.error("expecting tjson parameter to match properties of {{index}}", index=index) + elif tjson == None: + tjson = kwargs.tjson = True else: - if tjson == None and not read_only: - props = self.get_properties() - if props[EXISTS_TYPE]: - kwargs.tjson=True - from pyLibrary.env.typed_inserter import TypedInserter - self.encode = TypedInserter(self, id_column).typed_encode - else: - kwargs.tjson = False - Log.warning("{{index}} is not typed tjson={{tjson}}", index=self.settings.index, tjson=self.settings.tjson) - self.encode = get_encoder(id_column) + if tjson is True: + Log.error("expecting tjson parameter to match properties of {{index}}", index=index) + elif 
tjson == None: + tjson = kwargs.tjson = False + + if not read_only: + if tjson: + from pyLibrary.env.typed_inserter import TypedInserter + + self.encode = TypedInserter(self, id_column).typed_encode else: self.encode = get_encoder(id_column) @@ -145,12 +146,12 @@ class Index(Features): self.cluster.info = None return self.get_properties(retry=False) - if not index.mappings[self.settings.type]: + if not index.mappings[self.settings.type] and (index.mappings.keys()-{"_default_"}): Log.warning( "ElasticSearch index {{index|quote}} does not have type {{type|quote}} in {{metadata|json}}", index=self.settings.index, type=self.settings.type, - metadata=jx.sort(metadata.indices.keys()) + metadata=jx.sort(index.mappings.keys()) ) return Null return index.mappings[self.settings.type].properties @@ -195,35 +196,12 @@ class Index(Features): # WAIT FOR ALIAS TO APPEAR while True: - response = self.cluster.get("/_cluster/state", retry={"times": 5}, timeout=3) - if alias in response.metadata.indices[self.settings.index].aliases: + metadata = self.cluster.get_metadata(force=True) + if alias in metadata.indices[self.settings.index].aliases: return Log.note("Waiting for alias {{alias}} to appear", alias=alias) Till(seconds=1).wait() - - - def get_index(self, alias): - """ - RETURN THE INDEX USED BY THIS alias - """ - alias_list = self.cluster.get_aliases() - output = jx.sort(set([ - a.index - for a in alias_list - if a.alias == alias or - a.index == alias or - (re.match(re.escape(alias) + "\\d{8}_\\d{6}", a.index) and a.index != alias) - ])) - - if len(output) > 1: - Log.error("only one index with given alias==\"{{alias}}\" expected", alias= alias) - - if not output: - return Null - - return output.last() - def is_proto(self, index): """ RETURN True IF THIS INDEX HAS NOT BEEN ASSIGNED ITS ALIAS @@ -306,8 +284,6 @@ class Index(Features): else: raise NotImplementedError - - def extend(self, records): """ records - MUST HAVE FORM OF @@ -407,6 +383,22 @@ class Index(Features): Log.error("add() has changed to only accept one record, no lists") self.extend([record]) + def add_property(self, name, details): + if self.debug: + Log.note("Adding property {{prop}} to {{index}}", prop=name, index=self.settings.index) + for n in jx.reverse(split_field(name)): + if n == NESTED_TYPE: + details = {"properties": {n: set_default(details, {"type": "nested", "dynamic": True})}} + elif n.startswith(TYPE_PREFIX): + details = {"properties": {n: details}} + else: + details = {"properties": {n: set_default(details, {"type": "object", "dynamic": True})}} + + self.cluster.put( + "/" + self.settings.index + "/_mapping/" + self.settings.type, + data=details + ) + def refresh(self): self.cluster.post("/" + self.settings.index + "/_refresh") @@ -436,7 +428,7 @@ class Index(Features): elif self.cluster.version.startswith(("1.4.", "1.5.", "1.6.", "1.7.", "5.", "6.")): result = self.cluster.put( "/" + self.settings.index + "/_settings", - data='{"index":{"refresh_interval":' + value2json(interval) + '}}', + data={"index": {"refresh_interval": interval}}, **kwargs ) @@ -532,7 +524,7 @@ class Cluster(object): return cluster @override - def __init__(self, host, port=9200, explore_metadata=True, kwargs=None): + def __init__(self, host, port=9200, explore_metadata=True, debug=False, kwargs=None): """ settings.explore_metadata == True - IF PROBING THE CLUSTER FOR METADATA IS ALLOWED settings.timeout == NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests) @@ -542,12 +534,13 @@ class Cluster(object): 
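A usage sketch of the Cluster/Index handshake above; the host, port, and index name are assumptions, not from this patch, and tjson is left out so it is inferred from any existing typed properties:

    from pyLibrary.env.elasticsearch import Cluster

    cluster = Cluster(host="http://localhost", port=9200)
    index = cluster.get_or_create_index(index="demo_logs")  # tjson detected, or defaulted to typed
    index.add({"value": {"name": "example", "count": 1}})
    index.refresh()
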
self.settings = kwargs self.info = None - self._metadata = None + self._metadata = Null + self.index_new_since = {} # MAP FROM INDEX NAME TO TIME THE INDEX METADATA HAS CHANGED self.metadata_locker = Lock() - self.debug = kwargs.debug - self.version = None + self.last_metadata = Date.now() + self.debug = debug + self._version = None self.path = kwargs.host + ":" + text_type(kwargs.port) - self.get_metadata() @override def get_or_create_index( @@ -560,7 +553,7 @@ class Cluster(object): tjson=None, kwargs=None ): - best = self._get_best(kwargs) + best = self.get_best_matching_index(index, alias) if not best: output = self.create_index(kwargs=kwargs, schema=schema, limit_replicas=limit_replicas) return output @@ -573,39 +566,29 @@ class Cluster(object): index = kwargs.index meta = self.get_metadata() - columns = parse_properties(index, ".", meta.indices[index].mappings.values()[0].properties) + type, about = _get_best_type_from_mapping(meta.indices[index].mappings) - tjson = kwargs.tjson - if len(columns) != 0: - kwargs.tjson = tjson or any( - c.names["."].startswith(TYPE_PREFIX) or - c.names["."].find("." + TYPE_PREFIX) != -1 - for c in columns - ) - if tjson is None and not kwargs.tjson: - Log.warning("Not typed index, columns are:\n{{columns|json}}", columns=columns) + if tjson == None: + tjson = True + columns = parse_properties(index, ".", about.properties) + if len(columns) > 0: + tjson = any( + c.names["."].startswith(TYPE_PREFIX) or + c.names["."].find("." + TYPE_PREFIX) != -1 + for c in columns + ) + kwargs.tjson = tjson return Index(kwargs=kwargs, cluster=self) - def _get_best(self, settings): - aliases = self.get_aliases() - indexes = jx.sort([ - a - for a in aliases - if (a.alias == settings.index and settings.alias == None) or - (re.match(re.escape(settings.index) + r'\d{8}_\d{6}', a.index) and settings.alias == None) or - (a.index == settings.index and (settings.alias == None or a.alias == None or a.alias == settings.alias)) - ], "index") - return indexes.last() - @override - def get_index(self, index, type=None, alias=None, tjson=None, read_only=True, kwargs=None): + def get_index(self, index, type, alias=None, tjson=None, read_only=True, kwargs=None): """ TESTS THAT THE INDEX EXISTS BEFORE RETURNING A HANDLE """ if read_only: # GET EXACT MATCH, OR ALIAS - aliases = self.get_aliases() + aliases = wrap(self.get_aliases()) if index in aliases.index: pass elif index in aliases.alias: @@ -617,7 +600,7 @@ class Cluster(object): return Index(kwargs=kwargs, cluster=self) else: # GET BEST MATCH, INCLUDING PROTOTYPE - best = self._get_best(kwargs) + best = self.get_best_matching_index(index, alias) if not best: Log.error("Can not find index {{index_name}}", index_name=kwargs.index) @@ -643,6 +626,42 @@ class Cluster(object): return Index(read_only=True, kwargs=settings, cluster=self) Log.error("Can not find any index with alias {{alias_name}}", alias_name= alias) + def get_canonical_index(self, alias): + """ + RETURN THE INDEX USED BY THIS alias + THIS IS ACCORDING TO THE STRICT LIFECYCLE RULES: + THERE IS ONLY ONE INDEX WITH AN ALIAS + """ + output = jx.sort(set( + i + for ai in self.get_aliases() + for a, i in [(ai.alias, ai.index)] + if a == alias or i == alias or (re.match(re.escape(alias) + "\\d{8}_\\d{6}", i) and i != alias) + )) + + if len(output) > 1: + Log.error("only one index with given alias==\"{{alias}}\" expected", alias=alias) + + if not output: + return Null + + return output.last() + + def get_best_matching_index(self, index, alias=None): + indexes = jx.sort( + [ + 
ai_pair + for pattern in [re.escape(index) + r'\d{8}_\d{6}'] + for ai_pair in self.get_aliases() + for a, i in [(ai_pair.alias, ai_pair.index)] + if (a == index and alias == None) or + (re.match(pattern, i) and alias == None) or + (i == index and (alias == None or a == None or a == alias)) + ], + "index" + ) + return indexes.last() + def get_prototype(self, alias): """ RETURN ALL INDEXES THAT ARE INTENDED TO BE GIVEN alias, BUT HAVE NO @@ -698,11 +717,13 @@ class Cluster(object): Log.error("Expecting a JSON schema") for k, m in list(schema.mappings.items()): + m.date_detection = False # DISABLE DATE DETECTION + if tjson: - schema.mappings[k] = add_typed_annotations(m) + m = schema.mappings[k] = wrap(add_typed_annotations(m)) m = wrap(schema.mappings[k]) - schema.mappings[k].date_detection = False # DISABLE DATE DETECTION + m.date_detection = False # DISABLE DATE DETECTION m.dynamic_templates = ( DEFAULT_DYNAMIC_TEMPLATES + m.dynamic_templates @@ -737,11 +758,10 @@ class Cluster(object): ) # CONFIRM INDEX EXISTS - while True: + while not Till(seconds=30): try: - state = self.get("/_cluster/state", retry={"times": 5}, timeout=3, stream=False) - if index in state.metadata.indices: - self._metadata = None + metadata = self.get_metadata(force=True) + if index in metadata.indices: break Log.note("Waiting for index {{index}} to appear", index=index) except Exception as e: @@ -784,37 +804,50 @@ class Cluster(object): RETURN LIST OF {"alias":a, "index":i} PAIRS ALL INDEXES INCLUDED, EVEN IF NO ALIAS {"alias":Null} """ - data = self.get("/_aliases", retry={"times": 5}, timeout=3, stream=False) - output = [] - for index, desc in data.items(): + for index, desc in self.get_metadata().indices.items(): if not desc["aliases"]: - output.append({"index": index, "alias": None}) + yield wrap({"index": index}) + elif desc['aliases'][0] == index: + Log.error("should not happen") else: for a in desc["aliases"]: - output.append({"index": index, "alias": a}) - return wrap(output) + yield wrap({"index": index, "alias": a}) def get_metadata(self, force=False): if not self.settings.explore_metadata: Log.error("Metadata exploration has been disabled") - - if not self._metadata or force: - response = self.get("/_cluster/state", retry={"times": 3}, timeout=30, stream=False) - with self.metadata_locker: - self._metadata = wrap(response.metadata) - # REPLICATE MAPPING OVER ALL ALIASES - indices = self._metadata.indices - for i, m in jx.sort(indices.items(), {"value": {"offset": 0}, "sort": -1}): - m.index = i - for a in m.aliases: - if not indices[a]: - indices[a] = m - self.info = wrap(self.get("/", stream=False)) - self.version = self.info.version.number + if not force and self._metadata and Date.now() < self.last_metadata + STALE_METADATA: return self._metadata + old_indices = self._metadata.indices + response = self.get("/_cluster/state", retry={"times": 3}, timeout=30, stream=False) + now = self.last_metadata = Date.now() + with self.metadata_locker: + self._metadata = wrap(response.metadata) + for new_index_name, new_meta in self._metadata.indices.items(): + old_index = old_indices[new_index_name] + if not old_index: + self.index_new_since[new_index_name] = now + else: + for type_name, new_about in new_meta.mappings.items(): + old_about = old_index.mappings[type_name] + diff = diff_schema(new_about.properties, old_about.properties) + if diff: + self.index_new_since[new_index_name] = now + for old_index_name, old_meta in old_indices.items(): + new_index = self._metadata.indices[old_index_name] + if not new_index: + 
self.index_new_since[old_index_name] = now + self.info = wrap(self.get("/", stream=False)) + self._version = self.info.version.number return self._metadata + @property + def version(self): + if self._version is None: + self.get_metadata() + return self._version + def post(self, path, **kwargs): url = self.settings.host + ":" + text_type(self.settings.port) + path @@ -841,7 +874,7 @@ class Cluster(object): Log.note("POST {{url}}", url=url) response = http.post(url, **kwargs) if response.status_code not in [200, 201]: - Log.error(response.reason.decode("latin1") + ": " + strings.limit(response.content.decode("latin1"), 100 if self.debug else 10000)) + Log.error(text_type(response.reason) + ": " + strings.limit(response.content.decode("latin1"), 100 if self.debug else 10000)) if self.debug: Log.note("response: {{response}}", response=utf82unicode(response.content)[:130]) details = json2value(utf82unicode(response.content)) @@ -1058,16 +1091,7 @@ class Alias(Features): mappings = self.cluster.get("/"+self.settings.index+"/_mapping")[self.settings.index] # FIND MAPPING WITH MOST PROPERTIES (AND ASSUME THAT IS THE CANONICAL TYPE) - max_prop = -1 - for _type, mapping in mappings.mappings.items(): - if _type == "_default_": - continue - num_prop = len(mapping.properties.keys()) - if max_prop < num_prop: - max_prop = num_prop - self.settings.type = _type - type = _type - + type, props = _get_best_type_from_mapping(mappings.mappings) if type == None: Log.error("Can not find schema type for index {{index}}", index=coalesce(self.settings.alias, self.settings.index)) @@ -1077,7 +1101,7 @@ class Alias(Features): def url(self): return self.cluster.path.rstrip("/") + "/" + self.path.lstrip("/") - def get_schema(self, retry=True): + def get_snowflake(self, retry=True): if self.settings.explore_metadata: indices = self.cluster.get_metadata().indices if not self.settings.alias or self.settings.alias==self.settings.index: @@ -1186,6 +1210,7 @@ class Alias(Features): cause=e ) + def parse_properties(parent_index_name, parent_name, esProperties): """ RETURN THE COLUMN DEFINITIONS IN THE GIVEN esProperties OBJECT @@ -1195,8 +1220,6 @@ def parse_properties(parent_index_name, parent_name, esProperties): index_name = parent_index_name column_name = concat_field(parent_name, name) jx_name = column_name - if split_field(column_name)[-1] == EXISTS_TYPE: - property.type = "exists" if property.type == "nested" and property.properties: # NESTED TYPE IS A NEW TYPE DEFINITION @@ -1209,7 +1232,7 @@ def parse_properties(parent_index_name, parent_name, esProperties): es_index=index_name, es_column=column_name, names={".": jx_name}, - type="nested", + es_type="nested", nested_path=ROOT_PATH )) @@ -1223,7 +1246,7 @@ def parse_properties(parent_index_name, parent_name, esProperties): es_index=index_name, es_column=column_name, nested_path=ROOT_PATH, - type="source" if property.enabled == False else "object" + es_type="source" if property.enabled == False else "object" )) if property.dynamic: @@ -1240,7 +1263,7 @@ def parse_properties(parent_index_name, parent_name, esProperties): es_column=column_name, names={".": jx_name}, nested_path=ROOT_PATH, - type=property.type + es_type=property.type )) if property.index_name and name != property.index_name: columns.append(Column( @@ -1248,7 +1271,7 @@ def parse_properties(parent_index_name, parent_name, esProperties): es_column=column_name, names={".": jx_name}, nested_path=ROOT_PATH, - type=property.type + es_type=property.type )) elif property.enabled == None or property.enabled == 
False: columns.append(Column( @@ -1256,7 +1279,7 @@ def parse_properties(parent_index_name, parent_name, esProperties): es_column=column_name, names={".": jx_name}, nested_path=ROOT_PATH, - type="source" if property.enabled == False else "object" + es_type="source" if property.enabled == False else "object" )) else: Log.warning("unknown type {{type}} for property {{path}}", type=property.type, path=query_path) @@ -1264,6 +1287,25 @@ def parse_properties(parent_index_name, parent_name, esProperties): return columns +def _get_best_type_from_mapping(mapping): + """ + THERE ARE MULTIPLE TYPES IN AN INDEX, PICK THE BEST + :param mapping: THE ES MAPPING DOCUMENT + :return: (type_name, mapping) PAIR (mapping.properties WILL HAVE PROPERTIES + """ + best_type_name = None + best_mapping = None + for k, m in mapping.items(): + if k == "_default_": + continue + if best_type_name is None or len(m.properties) > len(best_mapping.properties): + best_type_name = k + best_mapping = m + if best_type_name == None: + return "_default_", mapping["_default_"] + return best_type_name, best_mapping + + def get_encoder(id_expression="_id"): get_id = jx_expression_to_function(id_expression) @@ -1404,18 +1446,44 @@ def add_typed_annotations(meta): else: output = {} for meta_name, meta_value in meta.items(): - if meta_name=='properties': - output[meta_name]={ + if meta_name == 'properties': + output[meta_name] = { prop_name: add_typed_annotations(about) if prop_name not in [BOOLEAN_TYPE, NUMBER_TYPE, STRING_TYPE, BOOLEAN_TYPE] else about for prop_name, about in meta_value.items() } output[meta_name][EXISTS_TYPE] = {"type": "long", "store": True} else: - output[meta_name]=meta_value + output[meta_name] = meta_value return output +def diff_schema(A, B): + """ + RETURN PROPERTIES IN A, BUT NOT IN B + :param A: elasticsearch properties + :param B: elasticsearch properties + :return: (name, properties) PAIRS WHERE name IS DOT-DELIMITED PATH + """ + output =[] + def _diff_schema(path, A, B): + for k, av in A.items(): + bv = B[k] + if bv == None: + output.append((concat_field(path, k), av)) + elif av.type == bv.type: + pass # OK + elif (av.type == None and bv.type == 'object') or (av.type == 'object' and bv.type == None): + pass # OK + else: + Log.warning("inconsistent types: {{typeA}} vs {{typeB}}", typeA=av.type, typeB=bv.type) + _diff_schema(concat_field(path, k), av.properties, bv.properties) + + # what to do with conflicts? 
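
`_get_best_type_from_mapping` below formalizes the heuristic the old inline loop in `Alias` used: among the mapping types of an index, the one with the most properties is assumed canonical. The same selection over plain dicts, so the sketch stands alone without mo_dots:

    def best_type(mappings):
        """Pick the non-default type with the most properties."""
        candidates = {k: m for k, m in mappings.items() if k != "_default_"}
        if not candidates:
            return "_default_", mappings.get("_default_", {})
        name = max(candidates, key=lambda k: len(candidates[k].get("properties", {})))
        return name, candidates[name]

    mappings = {
        "_default_": {"properties": {}},
        "bug": {"properties": {"id": {}, "status": {}, "summary": {}}},
        "comment": {"properties": {"text": {}}},
    }
    assert best_type(mappings)[0] == "bug"  # three properties beats one
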
+ _diff_schema(".", A, B) + return output + + DEFAULT_DYNAMIC_TEMPLATES = wrap([ { "default_typed_boolean": { @@ -1446,6 +1514,12 @@ DEFAULT_DYNAMIC_TEMPLATES = wrap([ "mapping": {"type": "nested", "store": True}, "match": NESTED_TYPE } + }, + { + "default_string": { + "mapping": {"type": "keyword", "store": True}, + "match_mapping_type": "string" + } } ]) @@ -1547,4 +1621,3 @@ _merge_type = { "nested": "nested" } } - diff --git a/vendor/pyLibrary/env/flask_wrappers.py b/vendor/pyLibrary/env/flask_wrappers.py index 9fb9dce..fa57b4d 100644 --- a/vendor/pyLibrary/env/flask_wrappers.py +++ b/vendor/pyLibrary/env/flask_wrappers.py @@ -11,6 +11,7 @@ from __future__ import division from __future__ import unicode_literals import flask +from flask import Response from mo_dots import coalesce from mo_future import binary_type @@ -28,10 +29,8 @@ def gzip_wrapper(func, compress_lower_limit=None): if 'gzip' not in accept_encoding.lower(): return response - resp = response.data - if isinstance(resp, binary_type) and len(resp) > compress_lower_limit: - response.headers['Content-Encoding'] = 'gzip' - response.set_data(b''.join(ibytes2icompressed([resp]))) + response.headers['Content-Encoding'] = 'gzip' + response.response = ibytes2icompressed(response.response) return response diff --git a/vendor/pyLibrary/env/http.py b/vendor/pyLibrary/env/http.py index 61c7076..4ee7b6b 100644 --- a/vendor/pyLibrary/env/http.py +++ b/vendor/pyLibrary/env/http.py @@ -31,7 +31,7 @@ from jx_python import jx from mo_dots import Data, coalesce, wrap, set_default, unwrap, Null from mo_future import text_type, PY2 from mo_json import value2json, json2value -from mo_logs import Log +from mo_logs import Log, strings from mo_logs.strings import utf82unicode, unicode2utf8 from mo_logs.exceptions import Except from mo_math import Math @@ -157,7 +157,7 @@ def request(method, url, zip=None, retry=None, **kwargs): try: if DEBUG: - Log.note(u"http {{method}} to {{url}}", method=method, url=url) + Log.note(u"http {{method|upper}} to {{url}}", method=method, url=text_type(url)) request_count += 1 del kwargs['retry'] @@ -221,11 +221,6 @@ def post(url, **kwargs): return HttpResponse(request('post', url, **kwargs)) -def delete(url, **kwargs): - kwargs.setdefault('stream', False) - return HttpResponse(request('delete', url, **kwargs)) - - def post_json(url, **kwargs): """ ASSUME RESPONSE IN IN JSON @@ -238,16 +233,11 @@ def post_json(url, **kwargs): Log.error(u"Expecting `json` parameter") response = post(url, **kwargs) - c = response.content - try: - details = json2value(utf82unicode(c)) - except Exception as e: - Log.error(u"Unexpected return value {{content}}", content=c, cause=e) - + details = json2value(utf82unicode(response.content)) if response.status_code not in [200, 201]: - Log.error(u"Bad response", cause=Except.wrap(details)) - - return details + Log.error(u"Bad response code {{code}}", code=response.status_code, cause=Except.wrap(details)) + else: + return details def put(url, **kwargs): diff --git a/vendor/pyLibrary/env/typed_inserter.py b/vendor/pyLibrary/env/typed_inserter.py index 6f78456..3df931b 100644 --- a/vendor/pyLibrary/env/typed_inserter.py +++ b/vendor/pyLibrary/env/typed_inserter.py @@ -21,7 +21,7 @@ from jx_base import python_type_to_json_type, INTEGER, NUMBER, EXISTS, NESTED, S from jx_python.expressions import jx_expression_to_function from jx_python.meta import Column from mo_dots import Data, FlatList, NullType, unwrap -from mo_future import text_type, binary_type, utf8_json_encoder, long +from mo_future 
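
The `gzip_wrapper` change in flask_wrappers.py replaces buffering (`response.data`) with compressing the WSGI body iterator in place, so large responses stream instead of being materialized twice in memory. A self-contained sketch of the incremental-compression idea (`ibytes2icompressed` itself is not reproduced here; this uses only the standard library):

    import gzip
    import io

    def iter_gzip(chunks):
        """Compress an iterable of byte chunks, yielding gzip output incrementally."""
        buffer = io.BytesIO()
        with gzip.GzipFile(fileobj=buffer, mode="wb") as zipper:
            for chunk in chunks:
                zipper.write(chunk)
                data = buffer.getvalue()
                if data:              # emit whatever the compressor has produced so far
                    yield data
                    buffer.seek(0)
                    buffer.truncate()
        tail = buffer.getvalue()      # the gzip trailer is written on close
        if tail:
            yield tail

    compressed = b"".join(iter_gzip([b"hello ", b"world"]))
    assert gzip.decompress(compressed) == b"hello world"

Assigning such a generator to `response.response` works because WSGI accepts any iterable of bytes as a response body.
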
import text_type, binary_type, utf8_json_encoder, long, sort_using_key from mo_json import ESCAPE_DCT, float2json, json2value from mo_json.encoder import problem_serializing, UnicodeBuilder, COMMA, COLON from mo_json.typed_encoder import encode_property, BOOLEAN_TYPE, NESTED_TYPE, EXISTS_TYPE, STRING_TYPE, NUMBER_TYPE @@ -60,7 +60,7 @@ class TypedInserter(object): if es: _schema = Data() for c in parse_properties(es.settings.alias, ".", es.get_properties()): - if c.type not in (OBJECT, NESTED): + if c.es_type not in (OBJECT, NESTED): _schema[c.names["."]] = c self.schema = unwrap(_schema) else: @@ -127,7 +127,7 @@ class TypedInserter(object): try: if isinstance(sub_schema, Column): value_json_type = python_type_to_json_type[value.__class__] - column_json_type = es_type_to_json_type[sub_schema.type] + column_json_type = es_type_to_json_type[sub_schema.es_type] if value_json_type == column_json_type: pass # ok @@ -283,9 +283,6 @@ class TypedInserter(object): append(_buffer, '}') elif _type is NullType: append(_buffer, 'null') - elif hasattr(value, '__json__'): - from mo_logs import Log - Log.error("do not know how to handle") elif hasattr(value, '__data__'): self._typed_encode(value.__data__(), sub_schema, path, net_new_properties, _buffer) elif hasattr(value, '__iter__'): @@ -338,11 +335,11 @@ class TypedInserter(object): sep = COMMA self._typed_encode(v, sub_schema, path, net_new_properties, _buffer) count += 1 - append(_buffer, ']'+COMMA+QUOTED_EXISTS_TYPE+COLON+ + text_type(count)) + append(_buffer, ']' + COMMA + QUOTED_EXISTS_TYPE + COLON + text_type(count)) def _dict2json(self, value, sub_schema, path, net_new_properties, _buffer): prefix = '{' - for k, v in ((kk, value[kk]) for kk in sorted(value.keys())): + for k, v in sort_using_key(value.items(), lambda r: r[0]): if v == None or v == '': continue append(_buffer, prefix) diff --git a/vendor/pyLibrary/queries/__init__.py b/vendor/pyLibrary/queries/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/vendor/pyLibrary/queries/jx_usingMySQL.py b/vendor/pyLibrary/queries/jx_usingMySQL.py deleted file mode 100644 index 919671b..0000000 --- a/vendor/pyLibrary/queries/jx_usingMySQL.py +++ /dev/null @@ -1,458 +0,0 @@ -# encoding: utf-8 -# -# -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this file, -# You can obtain one at http://mozilla.org/MPL/2.0/. 
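
`_dict2json` now orders properties with `sort_using_key`, presumably a Python 2/3 compatibility shim from mo_future, so the typed encoder emits keys deterministically. A simplified stdlib-only equivalent showing the effect:

    import json

    def dict2json(value):
        """Serialize dict items in sorted-key order; skip None/empty like the encoder."""
        parts = []
        for k, v in sorted(value.items(), key=lambda pair: pair[0]):
            if v is None or v == "":
                continue  # absent properties are omitted entirely
            parts.append(json.dumps(k) + ":" + json.dumps(v))
        return "{" + ",".join(parts) + "}"

    assert dict2json({"b": 1, "a": 2, "c": None}) == '{"a":2,"b":1}'

Deterministic key order keeps the generated JSON stable across runs, which matters for reproducible documents and test fixtures.
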
-# -# Author: Kyle Lahnakoski (kyle@lahnakoski.com) -# -from __future__ import absolute_import -from __future__ import division -from __future__ import unicode_literals - -from collections import Mapping - -import mo_json -from jx_base.expressions import jx_expression -from mo_collections.matrix import Matrix -from mo_dots import coalesce -from mo_dots import wrap, listwrap, unwrap -from mo_dots.lists import FlatList -from mo_future import text_type -from mo_kwargs import override -from mo_logs import Log -from mo_logs.exceptions import suppress_exception -from mo_logs.strings import indent, expand_template -from pyLibrary import convert -from pyLibrary.sql import SQL, SQL_IS_NULL, SQL_AND, SQL_IS_NOT_NULL, SQL_ORDERBY, SQL_LIMIT, sql_iso, sql_list, SQL_TRUE, sql_alias, SQL_OR, SQL_WHERE, SQL_NOT -from pyLibrary.sql.mysql import int_list_packer - - -class MySQL(object): - """ - jx to MySQL DATABASE QUERIES - """ - - @override - def __init__( - self, - host, - port, - username, - password, - debug=False, - schema=None, - preamble=None, - readonly=False, - kwargs=None - ): - from pyLibrary.sql.mysql import MySQL - - self.settings = kwargs - self._db = MySQL(kwargs) - - def __data__(self): - settings = self.settings.copy() - settings.settings = None - return unwrap(settings) - - def query(self, query, stacked=False): - """ - TRANSLATE JSON QUERY EXPRESSION ON SINGLE TABLE TO SQL QUERY - """ - from jx_base.query import QueryOp - - query = QueryOp.wrap(query) - - sql, post = self._subquery(query, isolate=False, stacked=stacked) - query.data = post(sql) - return query.data - - def update(self, query): - self.db.execute(""" - UPDATE {{table_name}} - SET {{assignment}} - {{where}} - """, { - "table_name": query["from"], - "assignment": ",".join(self.db.quote_column(k) + "=" + self.db.quote_value(v) for k, v in query.set), - "where": self._where2sql(query.where) - }) - - def _subquery(self, query, isolate=True, stacked=False): - if isinstance(query, text_type): - return self.db.quote_column(query), None - if query.name: # IT WOULD BE SAFER TO WRAP TABLE REFERENCES IN A TYPED OBJECT (Cube, MAYBE?) 
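
For readers tracking what the deleted module did: `MySQL.update()` expanded a template into a plain UPDATE statement, quoting each assignment through the driver. A rough reconstruction with deliberately naive quoting (illustration only; never use hand-rolled quoting against a live database):

    def update_sql(table, assignments, where_sql=None):
        """Shape of the UPDATE the removed translator produced."""
        def quote_column(name):
            return "`" + name.replace("`", "``") + "`"

        def quote_value(value):
            if value is None:
                return "NULL"
            if isinstance(value, (int, float)):
                return str(value)
            return "'" + str(value).replace("'", "''") + "'"

        sets = ",".join(
            quote_column(k) + "=" + quote_value(v) for k, v in assignments.items()
        )
        sql = "UPDATE " + quote_column(table) + " SET " + sets
        if where_sql:
            sql += " WHERE " + where_sql
        return sql

    print(update_sql("bugs", {"status": "closed"}, "`id`=42"))
    # UPDATE `bugs` SET `status`='closed' WHERE `id`=42
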
- return self.db.quote_column(query.name), None - - if query.edges: - # RETURN A CUBE - sql, post = self._grouped(query, stacked) - else: - select = listwrap(query.select) - if select[0].aggregate != "none": - sql, post = self._aggop(query) - else: - sql, post = self._setop(query) - - if isolate: - return "(\n" + sql + "\n) a\n", post - else: - return sql, post - - def _grouped(self, query, stacked=False): - select = listwrap(query.select) - - # RETURN SINGLE OBJECT WITH AGGREGATES - for s in select: - if s.aggregate not in aggregates: - Log.error("Expecting all columns to have an aggregate: {{select}}", select=s) - - selects = FlatList() - groups = FlatList() - edges = query.edges - for e in edges: - if e.domain.type != "default": - Log.error("domain of type {{type}} not supported, yet", type=e.domain.type) - groups.append(e.value) - selects.append(sql_alias(e.value, self.db.quote_column(e.name))) - - for s in select: - selects.append(sql_alias(aggregates[s.aggregate].replace("{{code}}", s.value), self.db.quote_column(s.name))) - - sql = expand_template(""" - SELECT - {{selects}} - FROM - {{table}} - {{where}} - GROUP BY - {{groups}} - """, { - "selects": SQL(",\n".join(selects)), - "groups": SQL(",\n".join(groups)), - "table": self._subquery(query["from"])[0], - "where": self._where2sql(query.where) - }) - - def post_stacked(sql): - # RETURN IN THE USUAL DATABASE RESULT SET FORMAT - return self.db.query(sql) - - def post(sql): - # FIND OUT THE default DOMAIN SIZES - result = self.db.column_query(sql) - num_edges = len(edges) - for e, edge in enumerate(edges): - domain = edge.domain - if domain.type == "default": - domain.type = "set" - parts = set(result[e]) - domain.partitions = [{"index": i, "value": p} for i, p in enumerate(parts)] - domain.map = {p: i for i, p in enumerate(parts)} - else: - Log.error("Do not know what to do here, yet") - - # FILL THE DATA CUBE - maps = [(unwrap(e.domain.map), result[i]) for i, e in enumerate(edges)] - cubes = FlatList() - for c, s in enumerate(select): - data = Matrix(*[len(e.domain.partitions) + (1 if e.allow_nulls else 0) for e in edges]) - for rownum, value in enumerate(result[c + num_edges]): - coord = [m[r[rownum]] for m, r in maps] - data[coord] = value - cubes.append(data) - - if isinstance(query.select, list): - return cubes - else: - return cubes[0] - - return sql, post if not stacked else post_stacked - - def _aggop(self, query): - """ - SINGLE ROW RETURNED WITH AGGREGATES - """ - if isinstance(query.select, list): - # RETURN SINGLE OBJECT WITH AGGREGATES - for s in query.select: - if s.aggregate not in aggregates: - Log.error("Expecting all columns to have an aggregate: {{select}}", select=s) - - selects = FlatList() - for s in query.select: - selects.append(sql_alias(aggregates[s.aggregate].replace("{{code}}", s.value),self.db.quote_column(s.name))) - - sql = expand_template(""" - SELECT - {{selects}} - FROM - {{table}} - {{where}} - """, { - "selects": SQL(",\n".join(selects)), - "table": self._subquery(query["from"])[0], - "where": self._where2sql(query.filter) - }) - - return sql, lambda sql: self.db.column(sql)[0] # RETURNING SINGLE OBJECT WITH AGGREGATE VALUES - else: - # RETURN SINGLE VALUE - s0 = query.select - if s0.aggregate not in aggregates: - Log.error("Expecting all columns to have an aggregate: {{select}}", select=s0) - - select = sql_alias(aggregates[s0.aggregate].replace("{{code}}", s0.value) , self.db.quote_column(s0.name)) - - sql = expand_template(""" - SELECT - {{selects}} - FROM - {{table}} - {{where}} - """, { - 
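
The grouped and aggregate paths of the removed translator both worked by substituting each select column into an aggregate template (the module's full `aggregates` map appears further below). A compact sketch of that expansion, with `str.format` standing in for the original moustache-style `{{code}}` templates:

    # aggregate names and templates mirror a subset of the module's own map
    AGGREGATES = {"count": "COUNT({code})", "sum": "SUM({code})", "avg": "AVG({code})"}

    def grouped_sql(table, edges, selects):
        """edges: group-by column names; selects: (alias, aggregate, code) triples."""
        cols = list(edges) + [
            AGGREGATES[agg].format(code=code) + " AS " + alias
            for alias, agg, code in selects
        ]
        return (
            "SELECT " + ", ".join(cols)
            + " FROM " + table
            + " GROUP BY " + ", ".join(edges)
        )

    print(grouped_sql("bugs", ["status"], [("n", "count", "id")]))
    # SELECT status, COUNT(id) AS n FROM bugs GROUP BY status
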
"selects": SQL(select), - "table": self._subquery(query["from"])[0], - "where": self._where2sql(query.where) - }) - - def post(sql): - result = self.db.column_query(sql) - return result[0][0] - - return sql, post # RETURN SINGLE VALUE - - def _setop(self, query): - """ - NO AGGREGATION, SIMPLE LIST COMPREHENSION - """ - if isinstance(query.select, list): - # RETURN BORING RESULT SET - selects = FlatList() - for s in listwrap(query.select): - if isinstance(s.value, Mapping): - for k, v in s.value.items: - selects.append(sql_alias(v, self.db.quote_column(s.name + "." + k))) - if isinstance(s.value, list): - for i, ss in enumerate(s.value): - selects.append(sql_alias(s.value, self.db.quote_column(s.name + "," + str(i)))) - else: - selects.append(sql_alias(s.value, self.db.quote_column(s.name))) - - sql = expand_template(""" - SELECT - {{selects}} - FROM - {{table}} - {{where}} - {{sort}} - {{limit}} - """, { - "selects": SQL(",\n".join(selects)), - "table": self._subquery(query["from"])[0], - "where": self._where2sql(query.where), - "limit": self._limit2sql(query.limit), - "sort": self._sort2sql(query.sort) - }) - - def post_process(sql): - result = self.db.query(sql) - for s in listwrap(query.select): - if isinstance(s.value, Mapping): - for r in result: - r[s.name] = {} - for k, v in s.value: - r[s.name][k] = r[s.name + "." + k] - r[s.name + "." + k] = None - - if isinstance(s.value, list): - # REWRITE AS TUPLE - for r in result: - r[s.name] = tuple(r[s.name + "," + str(i)] for i, ss in enumerate(s.value)) - for i, ss in enumerate(s.value): - r[s.name + "," + str(i)] = None - - expand_json(result) - return result - - return sql, post_process # RETURN BORING RESULT SET - else: - # RETURN LIST OF VALUES - if query.select.value == ".": - select = "*" - else: - name = query.select.name - select = sql_alias(query.select.value, self.db.quote_column(name)) - - sql = expand_template(""" - SELECT - {{selects}} - FROM - {{table}} - {{where}} - {{sort}} - {{limit}} - """, { - "selects": SQL(select), - "table": self._subquery(query["from"])[0], - "where": self._where2sql(query.where), - "limit": self._limit2sql(query.limit), - "sort": self._sort2sql(query.sort) - }) - - if query.select.value == ".": - def post(sql): - result = self.db.query(sql) - expand_json(result) - return result - - return sql, post - else: - return sql, lambda sql: [r[name] for r in self.db.query(sql)] # RETURNING LIST OF VALUES - - def _sort2sql(self, sort): - """ - RETURN ORDER BY CLAUSE - """ - if not sort: - return "" - return SQL_ORDERBY + sql_list([self.db.quote_column(o.field) + (" DESC" if o.sort == -1 else "") for o in sort]) - - def _limit2sql(self, limit): - return SQL("" if not limit else SQL_LIMIT + str(limit)) - - def _where2sql(self, where): - if where == None: - return "" - return SQL_WHERE + _esfilter2sqlwhere(self.db, where) - - -def esfilter2sqlwhere(db, esfilter): - return _esfilter2sqlwhere(db, esfilter) - - -def _esfilter2sqlwhere(db, esfilter): - """ - CONVERT ElassticSearch FILTER TO SQL FILTER - db - REQUIRED TO PROPERLY QUOTE VALUES AND COLUMN NAMES - """ - esfilter = wrap(esfilter) - - if esfilter is True: - return SQL_TRUE - elif esfilter["and"]: - return sql_iso(SQL_AND.join([esfilter2sqlwhere(db, a) for a in esfilter["and"]])) - elif esfilter["or"]: - return sql_iso(SQL_OR.join([esfilter2sqlwhere(db, a) for a in esfilter["or"]])) - elif esfilter["not"]: - return SQL_NOT + sql_iso(esfilter2sqlwhere(db, esfilter["not"])) - elif esfilter.term: - return sql_iso(SQL_AND.join([ - db.quote_column(col) + 
SQL("=") + db.quote_value(val) - for col, val in esfilter.term.items() - ])) - elif esfilter.terms: - for col, v in esfilter.terms.items(): - if len(v) == 0: - return "FALSE" - - with suppress_exception: - int_list = convert.value2intlist(v) - has_null = False - for vv in v: - if vv == None: - has_null = True - break - if int_list: - filter = int_list_packer(col, int_list) - if has_null: - return esfilter2sqlwhere(db, {"or": [{"missing": col}, filter]}) - else: - return esfilter2sqlwhere(db, filter) - else: - if has_null: - return esfilter2sqlwhere(db, {"missing": col}) - else: - return "false" - return db.quote_column(col) + " in " + sql_iso(sql_list([db.quote_value(val) for val in v])) - elif esfilter.script: - return sql_iso(esfilter.script) - elif esfilter.range: - name2sign = { - "gt": SQL(">"), - "gte": SQL(">="), - "lte": SQL("<="), - "lt": SQL("<") - } - - def single(col, r): - min = coalesce(r["gte"], r[">="]) - max = coalesce(r["lte"], r["<="]) - if min != None and max != None: - # SPECIAL CASE (BETWEEN) - sql = db.quote_column(col) + SQL(" BETWEEN ") + db.quote_value(min) + SQL_AND + db.quote_value(max) - else: - sql = SQL_AND.join( - db.quote_column(col) + name2sign[sign] + db.quote_value(value) - for sign, value in r.items() - ) - return sql - - output = sql_iso(SQL_AND.join([single(col, ranges) for col, ranges in esfilter.range.items()])) - return output - elif esfilter.missing: - if isinstance(esfilter.missing, text_type): - return sql_iso(db.quote_column(esfilter.missing) + SQL_IS_NULL) - else: - return sql_iso(db.quote_column(esfilter.missing.field) + SQL_IS_NULL) - elif esfilter.exists: - if isinstance(esfilter.exists, text_type): - return sql_iso(db.quote_column(esfilter.exists) + SQL_IS_NOT_NULL) - else: - return sql_iso(db.quote_column(esfilter.exists.field) + SQL_IS_NOT_NULL) - elif esfilter.match_all: - return SQL_TRUE - elif esfilter.instr: - return sql_iso(SQL_AND.join(["instr" + sql_iso(db.quote_column(col) + ", " + db.quote_value(val)) + ">0" for col, val in esfilter.instr.items()])) - else: - Log.error("Can not convert esfilter to SQL: {{esfilter}}", esfilter=esfilter) - - -def expand_json(rows): - # CONVERT JSON TO VALUES - for r in rows: - for k, json in list(r.items()): - if isinstance(json, text_type) and json[0:1] in ("[", "{"): - with suppress_exception: - value = mo_json.json2value(json) - r[k] = value - - -# MAP NAME TO SQL FUNCTION -aggregates = { - "one": "COUNT({{code}})", - "sum": "SUM({{code}})", - "add": "SUM({{code}})", - "count": "COUNT({{code}})", - "maximum": "MAX({{code}})", - "minimum": "MIN({{code}})", - "max": "MAX({{code}})", - "min": "MIN({{code}})", - "mean": "AVG({{code}})", - "average": "AVG({{code}})", - "avg": "AVG({{code}})", - "N": "COUNT({{code}})", - "X0": "COUNT({{code}})", - "X1": "SUM({{code}})", - "X2": "SUM(POWER({{code}}, 2))", - "std": "STDDEV({{code}})", - "stddev": "STDDEV({{code}})", - "var": "POWER(STDDEV({{code}}), 2)", - "variance": "POWER(STDDEV({{code}}), 2)" -} - -from jx_base.container import type2container - -type2container["mysql"] = MySQL diff --git a/vendor/pyLibrary/sql/mysql.py b/vendor/pyLibrary/sql/mysql.py index bc6aafb..034da1f 100644 --- a/vendor/pyLibrary/sql/mysql.py +++ b/vendor/pyLibrary/sql/mysql.py @@ -16,13 +16,10 @@ import subprocess from collections import Mapping from datetime import datetime -from pymysql import connect, InterfaceError, cursors - import mo_json from jx_python import jx from mo_dots import coalesce, wrap, listwrap, unwrap from mo_files import File -from mo_future import 
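
The `range` branch has one nicety worth isolating: when both a lower and an upper bound are present, it emits BETWEEN instead of two comparisons. A standalone mirror of the removed `single()` helper (the original also accepted the `>=`/`<=` spellings via coalesce; this sketch keeps only the ES-style keys):

    def range_to_sql(col, r):
        """Collapse gte+lte into BETWEEN; otherwise emit one comparison per bound."""
        lo, hi = r.get("gte"), r.get("lte")
        if lo is not None and hi is not None:
            return "%s BETWEEN %s AND %s" % (col, lo, hi)
        signs = {"gt": ">", "gte": ">=", "lt": "<", "lte": "<="}
        return " AND ".join(
            "%s%s%s" % (col, signs[op], val) for op, val in sorted(r.items())
        )

    assert range_to_sql("x", {"gte": 1, "lte": 9}) == "x BETWEEN 1 AND 9"
    assert range_to_sql("x", {"gt": 0}) == "x>0"
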
text_type, utf8_json_encoder, binary_type from mo_kwargs import override from mo_logs import Log from mo_logs.exceptions import Except, suppress_exception @@ -31,7 +28,10 @@ from mo_logs.strings import indent from mo_logs.strings import outdent from mo_math import Math from mo_times import Date -from pyLibrary.sql import SQL, SQL_NULL, SQL_SELECT, SQL_LIMIT, SQL_WHERE, SQL_LEFT_JOIN, SQL_COMMA, SQL_FROM, SQL_AND, sql_list, sql_iso, SQL_ASC, SQL_TRUE, SQL_ONE, SQL_DESC, SQL_IS_NULL, sql_alias +from pymysql import connect, InterfaceError, cursors + +from mo_future import text_type, utf8_json_encoder +from pyLibrary.sql import SQL, SQL_NULL, SQL_SELECT, SQL_LIMIT, SQL_WHERE, SQL_LEFT_JOIN, SQL_FROM, SQL_AND, sql_list, sql_iso, SQL_ASC, SQL_TRUE, SQL_ONE, SQL_DESC, SQL_IS_NULL, sql_alias from pyLibrary.sql.sqlite import join_column DEBUG = False diff --git a/vendor/pyLibrary/sql/sqlite.py b/vendor/pyLibrary/sql/sqlite.py index f82e634..d00ea3b 100644 --- a/vendor/pyLibrary/sql/sqlite.py +++ b/vendor/pyLibrary/sql/sqlite.py @@ -17,6 +17,8 @@ import re import sys from collections import Mapping +from mo_kwargs import override + from mo_future import allocate_lock as _allocate_lock, text_type, zip_longest from mo_dots import Data, coalesce from mo_files import File @@ -48,9 +50,9 @@ def _upgrade(): global sqlite3 try: - Log.note("sqlite not upgraded ") + Log.note("sqlite not upgraded") # return - # + # # import sys # import platform # if "windows" in platform.system().lower(): @@ -59,7 +61,7 @@ def _upgrade(): # source_dll = File("vendor/pyLibrary/vendor/sqlite/sqlite3_32.dll") # else: # source_dll = File("vendor/pyLibrary/vendor/sqlite/sqlite3_64.dll") - # + # # if not all(a == b for a, b in zip_longest(source_dll.read_bytes(), original_dll.read_bytes())): # original_dll.backup() # File.copy(source_dll, original_dll) @@ -81,7 +83,8 @@ class Sqlite(DB): canonical = None - def __init__(self, filename=None, db=None, upgrade=True): + @override + def __init__(self, filename=None, db=None, upgrade=True, load_functions=False, kwargs=None): """ :param db: Optional, wrap a sqlite db in a thread :return: Multithread-safe database @@ -89,6 +92,7 @@ class Sqlite(DB): if upgrade and not _upgraded: _upgrade() + self.settings = kwargs self.filename = File(filename).abspath self.db = db self.queue = Queue("sql commands") # HOLD (command, result, signal) PAIRS @@ -96,6 +100,8 @@ class Sqlite(DB): self.get_trace = TRACE self.upgrade = upgrade self.closed = False + if DEBUG: + Log.note("Sqlite version {{version}}", version=self.query("select sqlite_version()").data[0][0]) def _enhancements(self): def regex(pattern, value): @@ -196,28 +202,16 @@ class Sqlite(DB): try: if DEBUG: Log.note("Sqlite version {{version}}", version=sqlite3.sqlite_version) - if Sqlite.canonical: - self.db = Sqlite.canonical - else: - self.db = sqlite3.connect(coalesce(self.filename, ':memory:'), check_same_thread = False) + try: + if Sqlite.canonical: + self.db = Sqlite.canonical + else: + self.db = sqlite3.connect(coalesce(self.filename, ':memory:'), check_same_thread=False) + except Exception as e: + Log.error("could not open file {{filename}}", filename=self.filename) - library_loc = File.new_instance(sys.modules[__name__].__file__, "../..") - full_path = File.new_instance(library_loc, "vendor/sqlite/libsqlitefunctions.so").abspath - try: - trace = extract_stack(0)[0] - if self.upgrade: - if os.name == 'nt': - file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions.so") - else: - file = 
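
The `Sqlite` wrapper keeps a single worker thread that owns the connection and drains a command queue, because sqlite3 connections are not safely shared across threads. The essence of that design, reduced to the standard library (the real class adds tracing, signals, and transaction handling on top):

    import queue
    import sqlite3
    import threading

    def sqlite_worker(filename, commands):
        """Own the connection on one thread; serve (sql, reply_queue) pairs."""
        db = sqlite3.connect(filename or ":memory:")
        while True:
            item = commands.get()
            if item is None:       # sentinel: shut down cleanly
                db.close()
                return
            sql, reply = item
            try:
                reply.put(db.execute(sql).fetchall())
            except Exception as e:
                reply.put(e)       # hand the error back to the caller

    commands = queue.Queue()
    threading.Thread(target=sqlite_worker, args=(None, commands), daemon=True).start()

    reply = queue.Queue()
    commands.put(("SELECT sqlite_version()", reply))
    print(reply.get())             # e.g. [('3.39.4',)]
    commands.put(None)

The new DEBUG note in `__init__` (`select sqlite_version()`) runs through exactly this queue, so it doubles as a smoke test of the worker thread.
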
File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions") - - full_path = file.abspath - self.db.enable_load_extension(True) - self.db.execute(SQL_SELECT + "load_extension" + sql_iso(self.quote_value(full_path))) - except Exception as e: - if not _load_extension_warning_sent: - _load_extension_warning_sent = True - Log.warning("Could not load {{file}}}, doing without. (no SQRT for you!)", file=full_path, cause=e) + if self.settings.load_functions: + self._load_functions() while not please_stop: quad = self.queue.pop(till=please_stop) @@ -283,11 +277,25 @@ class Sqlite(DB): Log.note("Database is closed") self.db.close() - def quote_column(self, column_name, table=None): - return quote_column(column_name, table) + def _load_functions(self): + global _load_extension_warning_sent + library_loc = File.new_instance(sys.modules[__name__].__file__, "../..") + full_path = File.new_instance(library_loc, "vendor/sqlite/libsqlitefunctions.so").abspath + try: + trace = extract_stack(0)[0] + if self.upgrade: + if os.name == 'nt': + file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions.so") + else: + file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions") - def quote_value(self, value): - return quote_value(value) + full_path = file.abspath + self.db.enable_load_extension(True) + self.db.execute(SQL_SELECT + "load_extension" + sql_iso(quote_value(full_path))) + except Exception as e: + if not _load_extension_warning_sent: + _load_extension_warning_sent = True + Log.warning("Could not load {{file}}, doing without. (no SQRT for you!)", file=full_path, cause=e) def create_new_functions(self): @@ -297,6 +305,7 @@ class Sqlite(DB): self.db.create_function("REGEXP", 2, regexp) + _no_need_to_quote = re.compile(r"^\w+$", re.UNICODE)
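
`create_new_functions` registers a Python callback as SQLite's REGEXP operator; SQLite rewrites `value REGEXP pattern` into `regexp(pattern, value)`, which is why the callback takes the pattern first. A standalone demonstration (the index-name pattern echoes the `\d{8}_\d{6}` convention used by `get_best_matching_index` above):

    import re
    import sqlite3

    db = sqlite3.connect(":memory:")

    def regexp(pattern, value):
        # return truthy when the pattern matches anywhere in the value
        return value is not None and re.search(pattern, value) is not None

    db.create_function("REGEXP", 2, regexp)
    rows = db.execute(
        r"SELECT 'es52_20180509_123456' REGEXP 'es52_\d{8}_\d{6}'"
    ).fetchall()
    print(rows)  # [(1,)]

This gives a pure-Python route to pattern matching that does not depend on the optional `libsqlitefunctions` extension loaded by `_load_functions`.
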