Kyle Lahnakoski 2018-05-09 12:30:44 -04:00
Parent 2fc0169c0f
Commit 210c0883f9
70 changed files with 2864 additions and 2504 deletions

32
vendor/jx_base/README.md vendored Normal file

@ -0,0 +1,32 @@
## Some help for the programmer
Some nomenclature is required to help follow the logic of these modules.
### Table
Same as in database terminology: a single, unordered set of rows.
### Schema
A set of columns that describe all the (possibly optional) properties available on all rows of a table.
### Facts
Represents the multiple tables in a hierarchical database.
### Snowflake
JSON Query Expressions are used to query hierarchical databases. The relations in a hierarchical database are limited to a tree, so the path between any two tables is unique; in a query, no matter which table is the "origin", any column in the hierarchical database can be accessed using a unique combination of joins with the origin.
With this in mind, a Snowflake is a list of all columns, for all the tables, in the hierarchical database.
### Container
A datastore that holds multiple Facts.
### Namespace
Metadata for a Container: information on multiple Snowflakes.
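The sketch below is illustrative only: toy classes that mirror the nomenclature above, not the real `jx_base` implementations.

```python
class Column(object):
    def __init__(self, name, jx_type, nested_path):
        self.name = name                # name of the property
        self.jx_type = jx_type          # JSON type of its values
        self.nested_path = nested_path  # which table (arm of the tree) it lives in

class Schema(object):
    """The columns visible from one table."""
    def __init__(self, table_name, columns):
        self.table_name = table_name
        self.columns = columns

class Snowflake(object):
    """All columns, for all tables, in one hierarchical database."""
    def __init__(self, schemas):
        self.schemas = schemas          # one Schema per query path

class Facts(object):
    """The tables of one hierarchical database, related as a tree."""
    def __init__(self, container, snowflake):
        self.container = container
        self.snowflake = snowflake

class Namespace(object):
    """Metadata for a Container: information on multiple Snowflakes."""
    def __init__(self, snowflakes):
        self.snowflakes = snowflakes

class Container(object):
    """A datastore holding multiple Facts."""
    def __init__(self, facts, namespace):
        self.facts = facts
        self.namespace = namespace
```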

26
vendor/jx_base/__init__.py vendored

@ -14,14 +14,11 @@ from __future__ import unicode_literals
from collections import Mapping
from uuid import uuid4
from mo_json import value2json
from mo_logs.strings import expand_template, quote
from mo_logs import Log
from mo_dots import NullType, Data, FlatList, wrap, coalesce, listwrap
from mo_future import text_type, none_type, PY2
from mo_json import value2json
from mo_logs import Log
from mo_logs.strings import expand_template, quote
from mo_times import Date
IS_NULL = '0'
@ -39,7 +36,7 @@ STRUCT = [EXISTS, OBJECT, NESTED]
python_type_to_json_type = {
int: INTEGER,
int: NUMBER,
text_type: STRING,
float: NUMBER,
None: OBJECT,
@ -223,7 +220,7 @@ class {{class_name}}(Mapping):
return _exec(code, name)
class Table(DataClass(
class TableDesc(DataClass(
"Table",
[
"name",
@ -241,6 +238,7 @@ class Table(DataClass(
# return singlton.get_columns(table_name=self.name)
Column = DataClass(
"Column",
[
@ -248,8 +246,8 @@ Column = DataClass(
"names", # MAP FROM TABLE NAME TO COLUMN NAME (ONE COLUMN CAN HAVE MULTIPLE NAMES)
"es_column",
"es_index",
# "es_type",
"type",
"es_type",
{"name": "jx_type", "nulls": True},
{"name": "useSource", "default": False},
{"name": "nested_path", "nulls": True}, # AN ARRAY OF PATHS (FROM DEEPEST TO SHALLOWEST) INDICATING THE JSON SUB-ARRAYS
{"name": "count", "nulls": True},
@ -262,3 +260,11 @@ Column = DataClass(
{"eq": [{"last": "nested_path"}, {"literal": "."}]}
]}
)
from jx_base.container import Container
from jx_base.namespace import Namespace
from jx_base.facts import Facts
from jx_base.snowflake import Snowflake
from jx_base.table import Table
from jx_base.schema import Schema

14
vendor/jx_base/container.py vendored

@ -47,7 +47,9 @@ def _delayed_imports():
class Container(object):
"""
Containers are data storage capable of handling queries on that storage
CONTAINERS HOLD MULTIPLE FACTS AND CAN HANDLE
GENERAL JSON QUERY EXPRESSIONS ON THEIR CONTENTS
METADATA FOR A Container IS CALLED A Namespace
"""
__slots__ = ["data", "namespaces"]
@ -95,16 +97,6 @@ class Container(object):
else:
Log.error("Do not know how to handle {{type}}", type=frum.__class__.__name__)
def __init__(self, frum, schema=None):
object.__init__(self)
if not type2container:
_delayed_imports()
self.data = frum
if isinstance(schema, list):
Log.error("expecting map from es_column to column object")
def query(self, query):
if query.frum != self:
Log.error("not expected")

125
vendor/jx_base/expressions.py vendored

@ -11,7 +11,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
import itertools
import operator
from collections import Mapping
from decimal import Decimal
@ -20,7 +19,7 @@ import mo_json
from jx_base import OBJECT, python_type_to_json_type, BOOLEAN, NUMBER, INTEGER, STRING, IS_NULL
from jx_base.queries import is_variable_name, get_property_name
from mo_dots import coalesce, wrap, Null, split_field
from mo_future import text_type, utf8_json_encoder, get_function_name
from mo_future import text_type, utf8_json_encoder, get_function_name, zip_longest
from mo_json import scrub
from mo_logs import Log, Except
from mo_math import Math, MAX, MIN, UNION
@ -63,7 +62,7 @@ def jx_expression(expr, schema=None):
if len(leaves) == 0:
v.data_type = IS_NULL
if len(leaves) == 1:
v.data_type = list(leaves)[0].type
v.data_type = list(leaves)[0].jx_type
return output
@ -74,7 +73,9 @@ def _jx_expression(expr):
if isinstance(expr, Expression):
Log.error("Expecting JSON, not expression")
if expr in (True, False, None) or expr == None or isinstance(expr, (float, int, Decimal, Date)):
if expr is None:
return TRUE
elif expr in (True, False, None) or expr == None or isinstance(expr, (float, int, Decimal, Date)):
return Literal(None, expr)
elif isinstance(expr, text_type):
return Variable(expr)
@ -262,16 +263,17 @@ class Variable(Expression):
return {self}
def map(self, map_):
if not isinstance(map_, Mapping):
Log.error("Expecting Mapping")
return Variable(coalesce(map_.get(self.var), self.var))
def __hash__(self):
return self.var.__hash__()
def __eq__(self, other):
return self.var.__eq__(other)
if isinstance(other, Variable):
return self.var == other.var
elif isinstance(other, text_type):
return self.var == other
return False
def __unicode__(self):
return self.var
@ -419,12 +421,13 @@ class ScriptOp(Expression):
ONLY FOR WHEN YOU TRUST THE SCRIPT SOURCE
"""
def __init__(self, op, script):
def __init__(self, op, script, data_type=OBJECT):
Expression.__init__(self, op, None)
if not isinstance(script, text_type):
Log.error("expecting text of a script")
self.simplified = True
self.script = script
self.data_type = data_type
@classmethod
def define(cls, expr):
@ -498,15 +501,8 @@ class Literal(Expression):
elif self.term == None:
return False
Log.warning("expensive")
from mo_testing.fuzzytestcase import assertAlmostEqual
try:
assertAlmostEqual(self.term, other)
return True
except Exception:
return False
if isinstance(other, Literal):
return (self.term == other.term) or (self.json == other.json)
def __data__(self):
return {"literal": self.value}
@ -553,6 +549,7 @@ class Literal(Expression):
def partial_eval(self):
return self
ZERO = Literal("literal", 0)
ONE = Literal("literal", 1)
class NullOp(Literal):
@ -721,7 +718,10 @@ class DateOp(Literal):
def __init__(self, op, term):
if hasattr(self, "date"):
return
self.date = term
if isinstance(term, text_type):
self.date = term
else:
self.date = coalesce(term.literal, term)
v = unicode2Date(self.date)
if isinstance(v, Date):
Literal.__init__(self, op, v.unix)
@ -928,7 +928,11 @@ class FloorOp(Expression):
def __init__(self, op, terms, default=NULL):
Expression.__init__(self, op, terms)
self.lhs, self.rhs = terms
if len(terms) == 1:
self.lhs = terms[0]
self.rhs = ONE
else:
self.lhs, self.rhs = terms
self.default = default
def __data__(self):
@ -984,6 +988,11 @@ class EqOp(Expression):
else:
return {"eq": [self.lhs.__data__(), self.rhs.__data__()]}
def __eq__(self, other):
if isinstance(other, EqOp):
return self.lhs == other.lhs and self.rhs == other.rhs
return False
def vars(self):
return self.lhs.vars() | self.rhs.vars()
@ -1135,6 +1144,11 @@ class AndOp(Expression):
def __data__(self):
return {"and": [t.__data__() for t in self.terms]}
def __eq__(self, other):
if isinstance(other, AndOp):
return all(a == b for a, b in zip_longest(self.terms, other.terms))
return False
def vars(self):
output = set()
for t in self.terms:
@ -1149,53 +1163,46 @@ class AndOp(Expression):
@simplified
def partial_eval(self):
terms = []
ors = []
for t in self.terms:
or_terms = [[]] # LIST OF TUPLES FOR or-ing and and-ing
for i, t in enumerate(self.terms):
simple = BooleanOp("boolean", t).partial_eval()
if simple is TRUE:
pass
continue
elif simple is FALSE:
return FALSE
elif isinstance(simple, AndOp):
terms.extend([tt for tt in simple.terms if tt not in terms])
for and_terms in or_terms:
and_terms.extend([tt for tt in simple.terms if tt not in and_terms])
continue
elif isinstance(simple, OrOp):
ors.append(simple.terms)
or_terms = [
and_terms + [o]
for o in simple.terms
for and_terms in or_terms
]
continue
elif simple.type != BOOLEAN:
Log.error("expecting boolean value")
elif NotOp("not", simple).partial_eval() in terms:
return FALSE
elif simple not in terms:
terms.append(simple)
if len(ors) == 0:
if len(terms) == 0:
for and_terms in list(or_terms):
if NotOp("not", simple).partial_eval() in and_terms:
or_terms.remove(and_terms)
elif simple not in and_terms:
and_terms.append(simple)
if len(or_terms) == 1:
and_terms = or_terms[0]
if len(and_terms) == 0:
return TRUE
if len(terms) == 1:
return terms[0]
output = AndOp("and", terms)
return output
elif len(ors) == 1: # SOME SIMPLE COMMON FACTORING
if len(terms) == 0:
return OrOp("or", ors[0])
elif len(terms) == 1 and terms[0] in ors[0]:
return terms[0]
elif len(and_terms) == 1:
return and_terms[0]
else:
agg_terms = []
for combo in ors[0]:
agg_terms.append(
AndOp("and", [combo]+terms).partial_eval()
)
return OrOp("or", agg_terms).partial_eval()
elif len(terms) == 0:
return OrOp("or", ors[0])
agg_terms = []
for combo in itertools.product(*ors):
agg_terms.append(
AndOp("and", list(combo)+terms).partial_eval()
)
return OrOp("or", agg_terms)
return AndOp("and", and_terms)
return OrOp("or", [
AndOp("and", and_terms) if len(and_terms) > 1 else and_terms[0]
for and_terms in or_terms
])
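For reference, the `or_terms` bookkeeping above is the usual distribution of AND over OR. A standalone sketch of the same expansion on plain Python lists (names here are illustrative, not the expression classes):

```python
from itertools import product

# Each inner list is a disjunction; the whole expression is their conjunction:
# (a OR b) AND c AND (d OR e)
factors = [["a", "b"], ["c"], ["d", "e"]]

# Distributing AND over OR yields an OR of AND-combinations, one per product:
or_terms = [list(combo) for combo in product(*factors)]
# [['a', 'c', 'd'], ['a', 'c', 'e'], ['b', 'c', 'd'], ['b', 'c', 'e']]
```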
class OrOp(Expression):
data_type = BOOLEAN
@ -2390,9 +2397,9 @@ class SplitOp(Expression):
)
def missing(self):
v = self.value.to_ruby(not_null=True)
find = self.find.to_ruby(not_null=True)
index = v + ".indexOf(" + find + ", " + self.start.to_ruby() + ")"
v = self.value.to_es_script(not_null=True)
find = self.find.to_es_script(not_null=True)
index = v + ".indexOf(" + find + ", " + self.start.to_es_script() + ")"
return AndOp("and", [
self.default.missing(),

27
vendor/jx_base/facts.py vendored Normal file

@ -0,0 +1,27 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
class Facts(object):
"""
REPRESENT A HIERARCHICAL DATASTORE: MULTIPLE TABLES IN A DATABASE ALONG
WITH THE RELATIONS THAT CONNECT THEM ALL, BUT LIMITED TO A TREE
"""
def __init__(self, container, snowflake):
self.container = container
self.snowflake = snowflake
@property
def namespace(self):
return self.container.namespace

69
vendor/jx_base/namespace.py vendored Normal file

@ -0,0 +1,69 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from collections import Mapping
from jx_base.query import QueryOp
class Namespace(object):
"""
A CONGLOMERATION OF Snowflake METADATA
A Namespace HOLDS METADATA FOR A Collection
"""
def get_snowflake(self, fact_table):
raise NotImplementedError()
def get_schema(self, name):
raise NotImplementedError()
def convert(self, expr):
raise NotImplementedError()
def _convert_query(self, query):
output = QueryOp("from", None)
output.select = self._convert_clause(query.select)
output.where = self.convert(query.where)
output["from"] = self._convert_from(query["from"])
output.edges = self._convert_clause(query.edges)
output.having = convert_list(self._convert_having, query.having)
output.window = convert_list(self._convert_window, query.window)
output.sort = self._convert_clause(query.sort)
output.format = query.format
return output
def _convert_from(self, frum):
raise NotImplementedError()
def _convert_clause(self, clause):
raise NotImplementedError()
def _convert_having(self, clause):
raise NotImplementedError()
def _convert_window(self, clause):
raise NotImplementedError()
def convert_list(operator, operand):
if operand==None:
return None
elif isinstance(operand, Mapping):
return operator(operand)
else:
return map(operator, operand)
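A small usage sketch of `convert_list`; the `normalize` operator and the clause values here are hypothetical, and the helper is restated so the example runs on its own:

```python
from collections import Mapping  # same import used above


def convert_list(operator, operand):
    # restated here, identical in spirit to the helper above
    if operand == None:
        return None
    elif isinstance(operand, Mapping):
        return operator(operand)
    else:
        return map(operator, operand)


normalize = lambda clause: dict(clause, normalized=True)   # hypothetical operator

print(convert_list(normalize, None))                                  # None: a missing clause stays missing
print(convert_list(normalize, {"edge": "a"}))                         # a single mapping is converted directly
print(list(convert_list(normalize, [{"edge": "a"}, {"edge": "b"}])))  # a list is converted item by item
```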

2
vendor/jx_base/queries.py vendored

@ -15,7 +15,7 @@ from mo_future import text_type
from mo_logs import Log
keyword_pattern = re.compile(r"(\w|[\\.,$])+(?:\.(\w|[\\.,$])+)*")
keyword_pattern = re.compile(r"(\w|[\\.,$-])+(?:\.(\w|[\\.,$-])+)*")
def is_variable_name(value):
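The widened pattern now allows dashes inside variable names. A quick standalone check; the helper and the example names are illustrative, not part of the module:

```python
import re

# Same pattern as above, with "-" added to the allowed characters.
keyword_pattern = re.compile(r"(\w|[\\.,$-])+(?:\.(\w|[\\.,$-])+)*")


def covers_whole_name(name):
    m = keyword_pattern.match(name)
    return bool(m) and m.group(0) == name


print(covers_whole_name("run.machine.platform"))  # True
print(covers_whole_name("run.type-id"))           # True; the old pattern stopped at the "-"
```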

31
vendor/jx_base/query.py vendored

@ -14,23 +14,20 @@ from __future__ import unicode_literals
from collections import Mapping
from copy import copy
from mo_future import text_type
import jx_base
from jx_base import STRUCT
from jx_base.container import Container
from jx_base.dimensions import Dimension
from jx_base.domains import Domain, SetDomain, DefaultDomain
from jx_base.expressions import jx_expression, Expression, Variable, LeavesOp, ScriptOp, OffsetOp, TRUE, FALSE
from jx_base.queries import is_variable_name
from jx_base.schema import Schema
from mo_dots import Data, relative_field, concat_field
from mo_dots import coalesce, Null, set_default, unwraplist, literal_field
from mo_dots import wrap, unwrap, listwrap
from mo_dots.lists import FlatList
from mo_future import text_type
from mo_json.typed_encoder import untype_path
from mo_logs import Log
from mo_math import AND, UNION
from mo_math import Math
from mo_math import AND, UNION, Math
DEFAULT_LIMIT = 10
MAX_LIMIT = 10000
@ -62,7 +59,7 @@ class QueryOp(Expression):
# return output
def __init__(self, op, frum, select=None, edges=None, groupby=None, window=None, where=None, sort=None, limit=None, format=None):
if isinstance(frum, Container):
if isinstance(frum, jx_base.Table):
pass
else:
Expression.__init__(self, op, frum)
@ -206,7 +203,7 @@ class QueryOp(Expression):
return FALSE
@staticmethod
def wrap(query, table, schema):
def wrap(query, container, namespace):
"""
NORMALIZE QUERY SO IT CAN STILL BE JSON
"""
@ -214,10 +211,14 @@ class QueryOp(Expression):
return query
query = wrap(query)
output = QueryOp("from", table)
output.format = query.format
output.limit = Math.min(MAX_LIMIT, coalesce(query.limit, DEFAULT_LIMIT))
table = container.get_table(query['from'])
schema = table.schema
output = QueryOp(
op="from",
frum=table,
format=query.format,
limit=Math.min(MAX_LIMIT, coalesce(query.limit, DEFAULT_LIMIT))
)
if query.select or isinstance(query.select, (Mapping, list)):
output.select = _normalize_selects(query.select, query.frum, schema=schema)
@ -361,7 +362,7 @@ def _normalize_select(select, frum, schema=None):
canonical
)
for c in frum.get_columns()
if c.type not in STRUCT
if c.jx_type not in STRUCT
])
else:
Log.error("do not know what to do")
@ -773,9 +774,11 @@ def _normalize_sort(sort=None):
output.append({"value": s, "sort": 1})
elif Math.is_integer(s):
output.append({"value": OffsetOp("offset", s), "sort": 1})
elif all(d in sort_direction for d in s.values()) and not s.sort and not s.value:
elif not s.sort and not s.value and all(d in sort_direction for d in s.values()):
for v, d in s.items():
output.append({"value": jx_expression(v), "sort": sort_direction[d]})
elif not s.sort and not s.value:
Log.error("`sort` clause must have a `value` property")
else:
output.append({"value": jx_expression(coalesce(s.value, s.field)), "sort": coalesce(sort_direction[s.sort], 1)})
return output
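Per the branches above, the normalizer accepts several spellings of a `sort` clause; a few illustrative examples (field names hypothetical) and the common form they normalize to:

```python
# Each of these normalizes to a list of {"value": <expression>, "sort": <direction>}:
sort_examples = [
    "timestamp",                         # bare field name -> ascending
    2,                                   # integer -> sort by column offset (OffsetOp)
    {"value": "timestamp", "sort": -1},  # explicit value and direction
    {"timestamp": -1, "run.type": 1},    # map from field name to direction
]
```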

187
vendor/jx_base/schema.py vendored

@ -13,12 +13,99 @@ from __future__ import unicode_literals
from copy import copy
from jx_base import STRUCT, NESTED, PRIMITIVE, OBJECT, EXISTS
from mo_dots import join_field, split_field, Null, startswith_field, set_default, wrap
from mo_json.typed_encoder import unnest_path, untype_path, NESTED_TYPE
from jx_base import STRUCT, NESTED, OBJECT, EXISTS
from mo_dots import Null, startswith_field, set_default, wrap
from mo_json.typed_encoder import unnest_path, untype_path
from mo_logs import Log
class Schema(object):
"""
A Schema MAPS COLUMN NAMES OF A SINGLE TABLE TO COLUMN INSTANCES THAT MATCH
"""
def __init__(self, table_name, columns):
"""
:param table_name: A FULL NAME FOR THIS TABLE (NOT USED)
:param columns: ALL COLUMNS IN SNOWFLAKE
"""
self._columns = copy(columns)
self.table = table_name
self.query_path = "."
self.lookup, self.lookup_leaves, self.lookup_variables = _indexer(columns, self.query_path)
def __getitem__(self, column_name):
cs = self.lookup.get(column_name)
if cs:
return list(cs)
else:
return [wrap({"es_column": column_name})]
def items(self):
return self.lookup.items()
def get_column(self, name, table=None):
return self.lookup[name]
@property
def columns(self):
return self._columns
def get_column_name(self, column):
"""
RETURN THE COLUMN NAME, FROM THE PERSPECTIVE OF THIS SCHEMA
:param column:
:return: NAME OF column
"""
return column.names[self.query_path]
def values(self, name):
"""
RETURN VALUES FOR THE GIVEN PATH NAME
:param name:
:return:
"""
return list(self.lookup_variables.get(unnest_path(name), Null))
def leaves(self, name):
"""
RETURN LEAVES OF GIVEN PATH NAME
pull leaves, considering query_path and namespace
pull all first-level properties
pull leaves, including parent leaves
pull the head of any tree by name
:param name:
:return:
"""
return list(self.lookup_leaves.get(unnest_path(name), Null))
def map_to_es(self):
"""
RETURN A MAP FROM THE NAMESPACE TO THE es_column NAME
"""
full_name = self.query_path
return set_default(
{
c.names[full_name]: c.es_column
for k, cs in self.lookup.items()
# if startswith_field(k, full_name)
for c in cs if c.jx_type not in STRUCT
},
{
c.names["."]: c.es_column
for k, cs in self.lookup.items()
# if startswith_field(k, full_name)
for c in cs if c.jx_type not in STRUCT
}
)
@property
def columns(self):
return copy(self._columns)
def _indexer(columns, query_path):
all_names = set(unnest_path(n) for c in columns for n in c.names.values()) | {"."}
@ -29,7 +116,7 @@ def _indexer(columns, query_path):
nfp = unnest_path(cname)
if (
startswith_field(nfp, full_name) and
c.type not in [EXISTS, OBJECT, NESTED] and
c.es_type not in [EXISTS, OBJECT, NESTED] and
(c.es_column != "_id" or full_name == "_id")
):
cs = lookup_leaves.setdefault(full_name, set())
@ -44,7 +131,7 @@ def _indexer(columns, query_path):
nfp = unnest_path(cname)
if (
startswith_field(nfp, full_name) and
c.type not in [EXISTS, OBJECT] and
c.es_type not in [EXISTS, OBJECT] and
(c.es_column != "_id" or full_name == "_id") and
startswith_field(c.nested_path[0], query_path)
):
@ -81,93 +168,3 @@ def _indexer(columns, query_path):
return relative_lookup, lookup_leaves, lookup_variables
class Schema(object):
"""
A Schema MAPS ALL COLUMNS IN SNOWFLAKE FROM NAME TO COLUMN INSTANCE
"""
def __init__(self, table_name, columns):
"""
:param table_name: THE FACT TABLE
:param query_path: PATH TO ARM OF SNOWFLAKE
:param columns: ALL COLUMNS IN SNOWFLAKE
"""
self._columns = copy(columns)
table_path = split_field(table_name)
self.table = join_field(table_path[:1]) # USED AS AN EXPLICIT STATEMENT OF PERSPECTIVE IN THE DATABASE
query_path = join_field(table_path[1:]) # TODO: REPLACE WITH THE nested_path ARRAY
if query_path == ".":
self.query_path = query_path
else:
query_path += "."+NESTED_TYPE
self.query_path = [c for c in columns if c.type == NESTED and c.names["."] == query_path][0].es_column
self.lookup, self.lookup_leaves, self.lookup_variables = _indexer(columns, self.query_path)
def __getitem__(self, column_name):
cs = self.lookup.get(column_name)
if cs:
return list(cs)
else:
return [wrap({"es_column": column_name})]
def items(self):
return self.lookup.items()
def get_column(self, name, table=None):
return self.lookup[name]
def get_column_name(self, column):
"""
RETURN THE COLUMN NAME, FROM THE PERSPECTIVE OF THIS SCHEMA
:param column:
:return: NAME OF column
"""
return column.names[self.query_path]
def values(self, name):
"""
RETURN VALUES FOR THE GIVEN PATH NAME
:param name:
:return:
"""
return list(self.lookup_variables.get(unnest_path(name), Null))
def leaves(self, name, meta=False):
"""
RETURN LEAVES OF GIVEN PATH NAME
pull leaves, considering query_path and namespace
pull all first-level properties
pull leaves, including parent leaves
pull the head of any tree by name
:param name:
:return:
"""
return list(self.lookup_leaves.get(unnest_path(name), Null))
def map_to_es(self):
"""
RETURN A MAP FROM THE NAME SPACE TO THE es_column NAME
"""
full_name = self.query_path
return set_default(
{
c.names[full_name]: c.es_column
for k, cs in self.lookup.items()
# if startswith_field(k, full_name)
for c in cs if c.type not in STRUCT
},
{
c.names["."]: c.es_column
for k, cs in self.lookup.items()
# if startswith_field(k, full_name)
for c in cs if c.type not in STRUCT
}
)
@property
def columns(self):
return copy(self._columns)

30
vendor/jx_base/snowflake.py vendored Normal file

@ -0,0 +1,30 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
class Snowflake(object):
"""
REPRESENT ONE ALIAS, AND ITS NESTED ARRAYS
"""
def get_schema(self, query_path):
raise NotImplementedError()
@property
def query_paths(self):
raise NotImplementedError()
@property
def columns(self):
raise NotImplementedError()

22
vendor/jx_base/table.py vendored Normal file

@ -0,0 +1,22 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
class Table(object):
def __init__(self, full_name):
self.name = full_name
def map(self, mapping):
return self

3
vendor/jx_elasticsearch/es09/aggop.py vendored

@ -11,11 +11,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from jx_base.expressions import Variable
from jx_base.queries import is_variable_name
from jx_elasticsearch import es09
from jx_elasticsearch.es09.util import aggregates, fix_es_stats, build_es_query
from jx_elasticsearch import post as es_post
from jx_elasticsearch.es52.expressions import Variable
# from jx_elasticsearch.es52.expressions import Variable
from jx_python.containers.cube import Cube
from jx_python.expressions import jx_expression_to_function
from mo_collections.matrix import Matrix

17
vendor/jx_elasticsearch/es09/expressions.py vendored

@ -15,6 +15,8 @@ from collections import Mapping
from datetime import datetime
import re
from jx_base.queries import keyword_pattern
from mo_future import text_type
from pyLibrary import convert
from mo_collections import reverse
@ -129,13 +131,13 @@ class _MVEL(object):
list = []
for s in selectList:
if is_deep:
if s.value and isKeyword(s.value):
if s.value and is_variable_name(s.value):
shortForm = self._translate(s.value)
list.append("Value2Pipe(" + shortForm + ")\n")
else:
Log.error("do not know how to handle yet")
else:
if s.value and isKeyword(s.value):
if s.value and is_variable_name(s.value):
list.append("Value2Pipe(getDocValue(" + value2MVEL(s.value) + "))\n")
elif s.value:
shortForm = self._translate(s.value)
@ -490,19 +492,8 @@ def _where(esFilter, _translate):
VAR_CHAR = "abcdefghijklmnopqurstvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_.\""
keyword_pattern = re.compile(r"\.*\w*(?:\.\w+)*")
def isKeyword(value):
"""
RETURN TRUE IF THE value IS JUST A NAME OF A FIELD, A LIST OF FIELDS, (OR A VALUE)
"""
if not value or not isinstance(value, text_type):
Log.error("Expecting a string")
if keyword_pattern.match(value):
return True
return False
def value2MVEL(value):

18
vendor/jx_elasticsearch/es09/util.py vendored

@ -13,6 +13,10 @@ from __future__ import unicode_literals
from datetime import datetime
from jx_base.queries import is_variable_name
from mo_logs.strings import quote
from mo_logs import Log, strings
from mo_dots import Data
from mo_dots import coalesce
@ -23,7 +27,7 @@ from mo_math import COUNT
from mo_math import Math
from mo_math import stats
from jx_base import domains
from jx_elasticsearch.es09.expressions import value2MVEL, isKeyword
from jx_elasticsearch.es09.expressions import value2MVEL
from mo_times import durations
@ -68,7 +72,7 @@ def compileTime2Term(edge):
# IS THERE A LIMIT ON THE DOMAIN?
numPartitions = len(edge.domain.partitions)
value = edge.value
if isKeyword(value):
if is_variable_name(value):
value = "doc[\"" + value + "\"].value"
nullTest = compileNullTest(edge)
@ -109,7 +113,7 @@ def compileDuration2Term(edge):
# IS THERE A LIMIT ON THE DOMAIN?
numPartitions = len(edge.domain.partitions)
value = edge.value
if isKeyword(value):
if is_variable_name(value):
value = "doc[\"" + value + "\"].value"
ref = coalesce(edge.domain.min, edge.domain.max, durations.ZERO)
@ -141,7 +145,7 @@ def compileNumeric2Term(edge):
numPartitions = len(edge.domain.partitions)
value = edge.value
if isKeyword(value):
if is_variable_name(value):
value = "doc[\"" + value + "\"].value"
if not edge.domain.max:
@ -179,7 +183,7 @@ def compileString2Term(edge):
Log.error("edge script not supported yet")
value = edge.value
if isKeyword(value):
if is_variable_name(value):
value = strings.expand_template("getDocValue({{path}})", {"path": quote(value)})
else:
Log.error("not handled")
@ -202,7 +206,7 @@ def compileNullTest(edge):
# IS THERE A LIMIT ON THE DOMAIN?
value = edge.value
if isKeyword(value):
if is_variable_name(value):
value = "doc[\"" + value + "\"].value"
if not edge.domain.max:
@ -240,7 +244,7 @@ def compileEdges2Term(mvel_compiler, edges, constants):
def temp(term):
return FlatList([edge0.domain.getPartByKey(term)])
if edge0.value and isKeyword(edge0.value):
if edge0.value and is_variable_name(edge0.value):
return Data(
field=edge0.value,
term2parts=temp

54
vendor/jx_elasticsearch/es14/__init__.py vendored

@ -19,22 +19,17 @@ from jx_base.dimensions import Dimension
from jx_base.expressions import jx_expression
from jx_base.queries import is_variable_name
from jx_base.query import QueryOp
from jx_base.schema import Schema
from jx_elasticsearch.es14.aggs import es_aggsop, is_aggsop
from jx_elasticsearch.es14.deep import is_deepop, es_deepop
from jx_elasticsearch.es14.setop import is_setop, es_setop
from jx_elasticsearch.es14.util import aggregates
from jx_elasticsearch.meta import FromESMetadata
from jx_elasticsearch.meta import ElasticsearchMetadata, Table
from jx_python import jx
from mo_dots import Data, Null, unwrap
from mo_dots import coalesce, split_field, literal_field, unwraplist, join_field
from mo_dots import wrap, listwrap
from mo_dots.lists import FlatList
from mo_json import scrub
from mo_dots import Data, Null, unwrap, coalesce, split_field, literal_field, unwraplist, join_field, wrap, listwrap, FlatList
from mo_json import scrub, value2json
from mo_json.typed_encoder import TYPE_PREFIX
from mo_kwargs import override
from mo_logs import Log
from mo_logs.exceptions import Except
from pyLibrary import convert
from mo_logs import Log, Except
from pyLibrary.env import elasticsearch, http
@ -45,7 +40,7 @@ class ES14(Container):
def __new__(cls, *args, **kwargs):
if (len(args) == 1 and args[0].get("index") == "meta") or kwargs.get("index") == "meta":
output = FromESMetadata.__new__(FromESMetadata, *args, **kwargs)
output = ElasticsearchMetadata.__new__(ElasticsearchMetadata, *args, **kwargs)
output.__init__(*args, **kwargs)
return output
else:
@ -66,36 +61,46 @@ class ES14(Container):
typed=None,
kwargs=None
):
Container.__init__(self, None)
Container.__init__(self)
if not container.config.default:
container.config.default = {
"type": "elasticsearch",
"settings": unwrap(kwargs)
}
self.settings = kwargs
self.name = coalesce(name, alias, index)
self.name = name = coalesce(name, alias, index)
if read_only:
self.es = elasticsearch.Alias(alias=coalesce(alias, index), kwargs=kwargs)
else:
self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)
self.meta = FromESMetadata(kwargs=kwargs)
self._namespace = ElasticsearchMetadata(kwargs=kwargs)
self.settings.type = self.es.settings.type
self.edges = Data()
self.worker = None
columns = self.meta.get_columns(table_name=coalesce(name, alias, index))
self._schema = Schema(coalesce(name, alias, index), columns)
columns = self._namespace.get_snowflake(self._es.settings.alias).columns # ABSOLUTE COLUMNS
if typed == None:
# SWITCH ON TYPED MODE
self.typed = any(c.es_column.find(".$") != -1 for c in columns)
self.typed = any(c.es_column.find("."+TYPE_PREFIX) != -1 for c in columns)
else:
self.typed = typed
@property
def schema(self):
return self._schema
def snowflake(self):
return self._namespace.get_snowflake(self._es.settings.alias)
@property
def namespace(self):
return self._namespace
def get_table(self, full_name):
return Table(full_name, self)
def get_schema(self, query_path):
return self._namespace.get_schema(query_path)
def __data__(self):
settings = self.settings.copy()
@ -126,13 +131,10 @@ class ES14(Container):
def query(self, _query):
try:
query = QueryOp.wrap(_query, _query.frum, schema=self)
for n in self.namespaces:
query = n.convert(query)
query = QueryOp.wrap(_query, container=self, namespace=self.namespace)
for s in listwrap(query.select):
if not aggregates.get(s.aggregate):
if s.aggregate != None and not aggregates.get(s.aggregate):
Log.error(
"ES can not aggregate {{name}} because {{aggregate|quote}} is not a recognized aggregate",
name=s.name,
@ -213,7 +215,7 @@ class ES14(Container):
scripts.append({"doc": v.doc})
else:
v = scrub(v)
scripts.append({"script": "ctx._source." + k + " = " + jx_expression(v).to_ruby(schema).script(schema)})
scripts.append({"script": "ctx._source." + k + " = " + jx_expression(v).to_es_script(schema).script(schema)})
if results.hits.hits:
updates = []
@ -221,7 +223,7 @@ class ES14(Container):
for s in scripts:
updates.append({"update": {"_id": h._id, "_routing": unwraplist(h.fields[literal_field(schema._routing.path)])}})
updates.append(s)
content = ("\n".join(convert.value2json(c) for c in updates) + "\n").encode('utf-8')
content = ("\n".join(value2json(c) for c in updates) + "\n")
response = self.es.cluster.post(
self.es.path + "/_bulk",
data=content,

106
vendor/jx_elasticsearch/es14/aggs.py vendored

@ -11,29 +11,26 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from mo_future import text_type
from jx_base import EXISTS
from jx_base.domains import SetDomain
from jx_base.expressions import TupleOp, NULL
from jx_base.query import DEFAULT_LIMIT, MAX_LIMIT
from jx_elasticsearch import post as es_post
from jx_elasticsearch.es14.decoders import DefaultDecoder, AggsDecoder, ObjectDecoder
from jx_elasticsearch.es14.decoders import DimFieldListDecoder
from jx_elasticsearch.es14.decoders import DefaultDecoder, AggsDecoder, ObjectDecoder, DimFieldListDecoder
from jx_elasticsearch.es14.expressions import split_expression_by_depth, AndOp, Variable, NullOp
from jx_elasticsearch.es14.setop import get_pull_stats
from jx_elasticsearch.es14.util import aggregates
from jx_python import jx
from jx_python.expressions import jx_expression_to_function
from mo_dots import listwrap, Data, wrap, literal_field, set_default, coalesce, Null, split_field, FlatList, unwrap, unwraplist
from mo_future import text_type
from mo_json.typed_encoder import encode_property
from mo_logs import Log
from mo_logs.strings import quote
from mo_math import Math, MAX, UNION
from mo_times.timer import Timer
def is_aggsop(es, query):
es.cluster.get_metadata()
if query.edges or query.groupby or any(a != None and a != "none" for a in listwrap(query.select).aggregate):
return True
return False
@ -60,12 +57,12 @@ def get_decoders_by_depth(query):
edge = edge.copy()
vars_ = edge.value.vars()
for v in vars_:
if not schema.leaves(v, meta=True):
if not schema.leaves(v.var):
Log.error("{{var}} does not exist in schema", var=v)
elif edge.range:
vars_ = edge.range.min.vars() | edge.range.max.vars()
for v in vars_:
if not schema[v]:
if not schema[v.var]:
Log.error("{{var}} does not exist in schema", var=v)
elif edge.domain.dimension:
vars_ = edge.domain.dimension.fields
@ -78,7 +75,7 @@ def get_decoders_by_depth(query):
try:
vars_ |= edge.value.vars()
depths = set(len(c.nested_path) - 1 for v in vars_ for c in schema.leaves(v))
depths = set(len(c.nested_path) - 1 for v in vars_ for c in schema.leaves(v.var))
if -1 in depths:
Log.error(
"Do not know of column {{column}}",
@ -102,10 +99,8 @@ def sort_edges(query, prop):
ordered_edges = []
remaining_edges = getattr(query, prop)
for s in query.sort:
if not isinstance(s.value, Variable):
Log.error("can only sort by terms")
for e in remaining_edges:
if e.value.var == s.value.var:
if e.value == s.value:
if isinstance(e.domain, SetDomain):
pass # ALREADY SORTED?
else:
@ -113,6 +108,9 @@ def sort_edges(query, prop):
ordered_edges.append(e)
remaining_edges.remove(e)
break
else:
Log.error("Can not sort by {{expr}}, can only sort by an existing edge expression", expr=s.value)
ordered_edges.extend(remaining_edges)
return ordered_edges
@ -136,34 +134,38 @@ def es_aggsop(es, frum, query):
new_select["count_"+literal_field(s.value.var)] += [s]
else:
new_select[literal_field(s.value.var)] += [s]
else:
elif s.aggregate:
formula.append(s)
for canonical_name, many in new_select.items():
for s in many:
es_cols = frum.schema.values(s.value.var)
columns = frum.schema.values(s.value.var)
if s.aggregate == "count":
canonical_names = []
for es_col in es_cols:
cn = literal_field(es_col.es_column + "_count")
canonical_names.append(cn)
es_query.aggs[cn].value_count.field = es_col.es_column
if len(es_cols) == 1:
s.pull = jx_expression_to_function(canonical_names[0] + ".value")
for column in columns:
cn = literal_field(column.es_column + "_count")
if column.jx_type == EXISTS:
canonical_names.append(cn + ".doc_count")
es_query.aggs[cn].filter.range = {column.es_column: {"gt": 0}}
else:
canonical_names.append(cn+ ".value")
es_query.aggs[cn].value_count.field = column.es_column
if len(canonical_names) == 1:
s.pull = jx_expression_to_function(canonical_names[0])
else:
s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names]})
s.pull = jx_expression_to_function({"add": canonical_names})
elif s.aggregate == "median":
if len(es_cols) > 1:
if len(columns) > 1:
Log.error("Do not know how to count columns with more than one type (script probably)")
# ES USES DIFFERENT METHOD FOR PERCENTILES
key = literal_field(canonical_name + " percentile")
es_query.aggs[key].percentiles.field = es_cols[0].es_column
es_query.aggs[key].percentiles.field = columns[0].es_column
es_query.aggs[key].percentiles.percents += [50]
s.pull = jx_expression_to_function(key + ".values.50\\.0")
elif s.aggregate == "percentile":
if len(es_cols) > 1:
if len(columns) > 1:
Log.error("Do not know how to count columns with more than one type (script probably)")
# ES USES DIFFERENT METHOD FOR PERCENTILES
key = literal_field(canonical_name + " percentile")
@ -171,49 +173,49 @@ def es_aggsop(es, frum, query):
Log.error("Expecting percentile to be a float from 0.0 to 1.0")
percent = Math.round(s.percentile * 100, decimal=6)
es_query.aggs[key].percentiles.field = es_cols[0].es_column
es_query.aggs[key].percentiles.field = columns[0].es_column
es_query.aggs[key].percentiles.percents += [percent]
s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
elif s.aggregate == "cardinality":
canonical_names = []
for es_col in es_cols:
cn = literal_field(es_col.es_column + "_cardinality")
for column in columns:
cn = literal_field(column.es_column + "_cardinality")
canonical_names.append(cn)
es_query.aggs[cn].cardinality.field = es_col.es_column
if len(es_cols) == 1:
es_query.aggs[cn].cardinality.field = column.es_column
if len(columns) == 1:
s.pull = jx_expression_to_function(canonical_names[0] + ".value")
else:
s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0})
elif s.aggregate == "stats":
if len(es_cols) > 1:
if len(columns) > 1:
Log.error("Do not know how to count columns with more than one type (script probably)")
# REGULAR STATS
stats_name = literal_field(canonical_name)
es_query.aggs[stats_name].extended_stats.field = es_cols[0].es_column
es_query.aggs[stats_name].extended_stats.field = columns[0].es_column
# GET MEDIAN TOO!
median_name = literal_field(canonical_name + "_percentile")
es_query.aggs[median_name].percentiles.field = es_cols[0].es_column
es_query.aggs[median_name].percentiles.field = columns[0].es_column
es_query.aggs[median_name].percentiles.percents += [50]
s.pull = get_pull_stats(stats_name, median_name)
elif s.aggregate == "union":
pulls = []
for es_col in es_cols:
stats_name = encode_property(es_col.es_column)
for column in columns:
stats_name = encode_property(column.es_column)
if es_col.nested_path[0] == ".":
if column.nested_path[0] == ".":
es_query.aggs[stats_name] = {"terms": {
"field": es_col.es_column,
"field": column.es_column,
"size": Math.min(s.limit, MAX_LIMIT)
}}
pulls.append(get_bucket_keys(stats_name))
else:
es_query.aggs[stats_name] = {
"nested": {"path": es_col.nested_path[0]},
"nested": {"path": column.nested_path[0]},
"aggs": {"_nested": {"terms": {
"field": es_col.es_column,
"field": column.es_column,
"size": Math.min(s.limit, MAX_LIMIT)
}}}
}
@ -228,11 +230,11 @@ def es_aggsop(es, frum, query):
for p in pulls
)
else:
if len(es_cols) > 1:
if len(columns) > 1:
Log.error("Do not know how to count columns with more than one type (script probably)")
# PULL VALUE OUT OF THE stats AGGREGATE
es_query.aggs[literal_field(canonical_name)].extended_stats.field = es_cols[0].es_column
es_query.aggs[literal_field(canonical_name)].extended_stats.field = columns[0].es_column
s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." + aggregates[s.aggregate], s.default]})
for i, s in enumerate(formula):
@ -245,13 +247,13 @@ def es_aggsop(es, frum, query):
else:
Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate)
elif s.aggregate == "count":
es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_ruby(schema).script(schema)
es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_es_script(schema).script(schema)
s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value")
elif s.aggregate == "median":
# ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
key = literal_field(canonical_name + " percentile")
es_query.aggs[key].percentiles.script = s.value.to_ruby(schema).script(schema)
es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema)
es_query.aggs[key].percentiles.percents += [50]
s.pull = jx_expression_to_function(key + ".values.50\\.0")
elif s.aggregate == "percentile":
@ -259,35 +261,35 @@ def es_aggsop(es, frum, query):
key = literal_field(canonical_name + " percentile")
percent = Math.round(s.percentile * 100, decimal=6)
es_query.aggs[key].percentiles.script = s.value.to_ruby(schema).script(schema)
es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema)
es_query.aggs[key].percentiles.percents += [percent]
s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
elif s.aggregate == "cardinality":
# ES USES DIFFERENT METHOD FOR CARDINALITY
key = canonical_name + " cardinality"
es_query.aggs[key].cardinality.script = s.value.to_ruby(schema).script(schema)
es_query.aggs[key].cardinality.script = s.value.to_es_script(schema).script(schema)
s.pull = jx_expression_to_function(key + ".value")
elif s.aggregate == "stats":
# REGULAR STATS
stats_name = literal_field(canonical_name)
es_query.aggs[stats_name].extended_stats.script = s.value.to_ruby(schema).script(schema)
es_query.aggs[stats_name].extended_stats.script = s.value.to_es_script(schema).script(schema)
# GET MEDIAN TOO!
median_name = literal_field(canonical_name + " percentile")
es_query.aggs[median_name].percentiles.script = s.value.to_ruby(schema).script(schema)
es_query.aggs[median_name].percentiles.script = s.value.to_es_script(schema).script(schema)
es_query.aggs[median_name].percentiles.percents += [50]
s.pull = get_pull_stats(stats_name, median_name)
elif s.aggregate=="union":
# USE TERMS AGGREGATE TO SIMULATE union
stats_name = literal_field(canonical_name)
es_query.aggs[stats_name].terms.script_field = s.value.to_ruby(schema).script(schema)
es_query.aggs[stats_name].terms.script_field = s.value.to_es_script(schema).script(schema)
s.pull = jx_expression_to_function(stats_name + ".buckets.key")
else:
# PULL VALUE OUT OF THE stats AGGREGATE
s.pull = jx_expression_to_function(canonical_name + "." + aggregates[s.aggregate])
es_query.aggs[canonical_name].extended_stats.script = s.value.to_ruby(schema).script(schema)
es_query.aggs[canonical_name].extended_stats.script = s.value.to_es_script(schema).script(schema)
decoders = get_decoders_by_depth(query)
start = 0
@ -312,11 +314,7 @@ def es_aggsop(es, frum, query):
es_query = wrap({
"aggs": {"_nested": set_default(
{
"nested": {
"path": schema.query_path
}
},
{"nested": {"path": schema.query_path[0]}},
es_query
)}
})
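Roughly, when the schema's query path points into a nested arm, the whole aggregation is wrapped in a `nested` agg. A hedged sketch of the resulting JSON shape; the path and inner aggregation are hypothetical:

```python
# Hypothetical inner aggregation and path, merged the same way set_default does above.
inner = {"aggs": {"by_author": {"terms": {"field": "comments.author"}}}}

wrapped = {"aggs": {"_nested": dict({"nested": {"path": "comments"}}, **inner)}}
# -> {"aggs": {"_nested": {"nested": {"path": "comments"},
#                          "aggs": {"by_author": {"terms": {"field": "comments.author"}}}}}}
```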
@ -442,6 +440,8 @@ def aggs_iterator(aggs, decoders, coord=True):
if coord:
for a, parts in _aggs_iterator(unwrap(aggs), depth - 1):
coord = tuple(d.get_index(parts) for d in decoders)
if any(c is None for c in coord):
continue
yield parts, coord, a
else:
for a, parts in _aggs_iterator(unwrap(aggs), depth - 1):

201
vendor/jx_elasticsearch/es14/decoders.py vendored

@ -13,20 +13,20 @@ from __future__ import unicode_literals
from collections import Mapping
from mo_future import text_type, binary_type
from jx_base import STRING, NUMBER, BOOLEAN
from jx_base.dimensions import Dimension
from jx_base.domains import SimpleSetDomain, DefaultDomain, PARTITION
from jx_base.expressions import TupleOp
from jx_base.expressions import TupleOp, TRUE
from jx_base.query import MAX_LIMIT, DEFAULT_LIMIT
from jx_elasticsearch.es14.expressions import Variable, NotOp, InOp, Literal, OrOp, AndOp, InequalityOp, LeavesOp
from jx_elasticsearch.es14.expressions import Variable, NotOp, InOp, Literal, AndOp, InequalityOp, LeavesOp, LIST_TO_PIPE
from jx_python import jx
from mo_dots import set_default, coalesce, literal_field, Data, relative_field
from mo_dots import wrap
from mo_dots import wrap, set_default, coalesce, literal_field, Data, relative_field, unwraplist
from mo_future import text_type
from mo_json.typed_encoder import untype_path
from mo_logs import Log
from mo_math import MAX, MIN
from mo_math import Math
from mo_logs.strings import quote, expand_template
from mo_math import MAX, MIN, Math
from pyLibrary.convert import string2boolean
class AggsDecoder(object):
@ -37,7 +37,7 @@ class AggsDecoder(object):
# if query.groupby:
# return object.__new__(DefaultDecoder, e)
if isinstance(e.value, (text_type, binary_type)):
if isinstance(e.value, text_type):
Log.error("Expecting Variable or Expression, not plain string")
if isinstance(e.value, LeavesOp):
@ -63,6 +63,9 @@ class AggsDecoder(object):
limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)
if col.partitions != None:
if col.multi > 1 and len(col.partitions) < 6:
return object.__new__(MultivalueDecoder)
partitions = col.partitions[:limit:]
if e.domain.sort==-1:
partitions = list(reversed(sorted(partitions)))
@ -138,18 +141,18 @@ class SetDecoder(AggsDecoder):
def __init__(self, edge, query, limit):
AggsDecoder.__init__(self, edge, query, limit)
domain = self.domain = edge.domain
self.sorted = None
self.pull = pull_functions[STRING]
# WE ASSUME IF THE VARIABLES MATCH, THEN THE SORT TERM AND EDGE TERM MATCH, AND WE SORT BY TERM
# self.sorted = {1: "asc", -1: "desc", None: None}[getattr(edge.domain, 'sort', None)]
edge_var = edge.value.vars()
edge_var = set(v.var for v in edge.value.vars())
if query.sort:
for s in query.sort:
if not edge_var - s.value.vars():
if not edge_var - set(v.var for v in s.value.vars()):
self.sorted = {1: "asc", -1: "desc"}[s.sort]
parts = jx.sort(domain.partitions, {"value": domain.key, "sort": s.sort})
edge.domain = self.domain = SimpleSetDomain(key=domain.key, label=domain.label, partitions=parts)
else:
self.sorted = None
def append_query(self, es_query, start):
self.start = start
@ -180,7 +183,7 @@ class SetDecoder(AggsDecoder):
}}, es_query)
else:
terms = set_default({"terms": {
"script": value.to_ruby(self.schema).script(self.schema),
"script": value.to_es_script(self.schema).script(self.schema),
"size": limit
}}, es_query)
@ -206,7 +209,7 @@ class SetDecoder(AggsDecoder):
return self.domain.getKeyByIndex(index)
def get_value_from_row(self, row):
return row[self.start].get('key')
return self.pull(row[self.start].get('key'))
def get_index(self, row):
try:
@ -242,7 +245,7 @@ def _range_composer(edge, domain, es_query, to_float, schema):
if isinstance(edge.value, Variable):
calc = {"field": schema.leaves(edge.value.var)[0].es_column}
else:
calc = {"script": edge.value.to_ruby(schema).script(schema)}
calc = {"script": edge.value.to_es_script(schema).script(schema)}
return wrap({"aggs": {
"_match": set_default(
@ -446,6 +449,44 @@ class RangeDecoder(AggsDecoder):
return 1
class MultivalueDecoder(SetDecoder):
def __init__(self, edge, query, limit):
AggsDecoder.__init__(self, edge, query, limit)
self.var = edge.value.var
self.values = query.frum.schema[edge.value.var][0].partitions
self.parts = []
def append_query(self, es_query, start):
self.start = start
es_field = self.query.frum.schema.leaves(self.var)[0].es_column
es_query = wrap({"aggs": {
"_match": set_default({"terms": {
"script": expand_template(LIST_TO_PIPE, {"expr": 'doc[' + quote(es_field) + '].values'})
}}, es_query)
}})
return es_query
def get_value_from_row(self, row):
values = row[self.start]['key'].replace("||", "\b").split("|")
if len(values) == 2:
return None
return unwraplist([v.replace("\b", "|") for v in values[1:-1]])
def get_index(self, row):
find = self.get_value_from_row(row)
try:
return self.parts.index(find)
except Exception:
self.parts.append(find)
return len(self.parts)-1
@property
def num_columns(self):
return 1
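The multivalue key built by the LIST_TO_PIPE script appears to be pipe-delimited, with literal pipes doubled; a standalone, simplified mirror of `get_value_from_row` (without the `unwraplist` collapse), under that assumed encoding:

```python
def decode_multivalue(key):
    # simplified mirror of get_value_from_row above, for one pipe-encoded key
    values = key.replace("||", "\b").split("|")
    if len(values) == 2:          # just the two delimiters: no values at all
        return None
    return [v.replace("\b", "|") for v in values[1:-1]]


print(decode_multivalue("|debug|opt|"))   # ['debug', 'opt']
print(decode_multivalue("|a||b|"))        # ['a|b']  (escaped pipe restored)
print(decode_multivalue("|"))             # None
```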
class ObjectDecoder(AggsDecoder):
def __init__(self, edge, query, limit):
AggsDecoder.__init__(self, edge, query, limit)
@ -535,70 +576,67 @@ class DefaultDecoder(SetDecoder):
self.parts = list()
self.key2index = {}
self.computed_domain = False
self.script = self.edge.value.partial_eval().to_es_script(self.schema)
self.pull = pull_functions[self.script.data_type]
self.missing = self.script.miss.partial_eval()
self.exists = NotOp("not", self.missing).partial_eval()
# WE ASSUME IF THE VARIABLES MATCH, THEN THE SORT TERM AND EDGE TERM MATCH, AND WE SORT BY TERM
self.sorted = None
edge_var = edge.value.vars()
for s in query.sort:
if not edge_var - s.value.vars():
self.sorted = {1: "asc", -1: "desc"}[s.sort]
# WHEN SORT VALUE AND EDGE VALUE MATCHES, WE SORT BY TERM
sort_candidates = [s for s in self.query.sort if s.value == self.edge.value]
if sort_candidates:
self.es_order = {"_term": {1: "asc", -1: "desc"}[sort_candidates[0].sort]}
else:
self.es_order = None
def append_query(self, es_query, start):
self.start = start
value = self.edge.value.partial_eval()
script = value.to_ruby(self.schema)
exists = NotOp("not", script.miss).partial_eval()
if not isinstance(self.edge.value, Variable):
output = wrap({"aggs": {
"_match": {
"filter": exists.to_esfilter(self.schema),
"aggs": {
"_filter": set_default(
{"terms": {
"script": script.expr,
"size": self.domain.limit,
"order": {"_term": self.sorted} if self.sorted else None
}},
es_query
)
}
},
"_missing": set_default(
{"filter": NotOp("not", exists).to_esfilter(self.schema)},
es_query
)
}})
return output
elif self.edge.value.var in [s.value.var for s in self.query.sort]:
sort_dir = [s.sort for s in self.query.sort if s.value.var == self.edge.value.var][0]
output = wrap({"aggs": {
"_match": set_default(
{"terms": {
"field": self.schema.leaves(self.edge.value.var)[0].es_column,
"size": self.domain.limit,
"order": {"_term": "asc" if sort_dir == 1 else "desc"}
}},
es_query
),
"_missing": set_default(
{"filter": NotOp("not", exists).to_esfilter(self.schema)},
es_query
)
}})
if self.exists is TRUE:
# IF True THEN WE DO NOT NEED THE _filter OR THE _missing (THIS RARELY HAPPENS THOUGH)
output = wrap({"aggs": {
"_match": set_default(
{"terms": {
"script": self.script.expr,
"size": self.domain.limit,
"order": self.es_order
}},
es_query
)
}})
else:
output = wrap({"aggs": {
"_match": { # _match AND _filter REVERSED SO _match LINES UP WITH _missing
"filter": self.exists.to_esfilter(self.schema),
"aggs": {
"_filter": set_default(
{"terms": {
"script": self.script.expr,
"size": self.domain.limit,
"order": self.es_order
}},
es_query
)
}
},
"_missing": set_default(
{"filter": self.missing.to_esfilter(self.schema)},
es_query
)
}})
return output
else:
output = wrap({"aggs": {
"_match": set_default(
{"terms": {
"field": self.schema.leaves(self.edge.value.var)[0].es_column,
"size": self.domain.limit
"size": self.domain.limit,
"order": self.es_order
}},
es_query
),
"_missing": set_default(
{"filter": NotOp("not", exists).to_esfilter(self.schema)},
{"filter": self.missing.to_esfilter(self.schema)},
es_query
)
}})
@ -608,7 +646,7 @@ class DefaultDecoder(SetDecoder):
part = row[self.start]
if part['doc_count']:
if part.get('key') != None:
self.parts.append(part.get('key'))
self.parts.append(self.pull(part.get('key')))
else:
self.edge.allowNulls = True # OK! WE WILL ALLOW NULLS
@ -623,19 +661,19 @@ class DefaultDecoder(SetDecoder):
if self.computed_domain:
try:
part = row[self.start]
return self.domain.getIndexByKey(part.get('key'))
return self.domain.getIndexByKey(self.pull(part.get('key')))
except Exception as e:
Log.error("problem", cause=e)
else:
try:
part = row[self.start]
key = part.get('key')
key = self.pull(part.get('key'))
i = self.key2index.get(key)
if i is None:
i = len(self.parts)
part = {"key": key, "dataIndex": i}
self.parts.append({"key": key, "dataIndex": i})
self.key2index[i] = part
self.parts.append(part)
self.key2index[key] = i
return i
except Exception as e:
Log.error("problem", cause=e)
@ -648,6 +686,7 @@ class DefaultDecoder(SetDecoder):
class DimFieldListDecoder(SetDecoder):
def __init__(self, edge, query, limit):
AggsDecoder.__init__(self, edge, query, limit)
edge.allowNulls = False
self.fields = edge.domain.dimension.fields
self.domain = self.edge.domain
self.domain.limit = Math.min(coalesce(self.domain.limit, query.limit, 10), MAX_LIMIT)
@ -665,11 +704,10 @@ class DimFieldListDecoder(SetDecoder):
"size": self.domain.limit
}}, es_query)}
}}})
if self.edge.allowNulls:
nest.aggs._missing = set_default(
{"filter": NotOp("not", exists).to_esfilter(self.schema)},
es_query
)
nest.aggs._missing = set_default(
{"filter": NotOp("not", exists).to_esfilter(self.schema)},
es_query
)
es_query = nest
if self.domain.where:
@ -696,11 +734,20 @@ class DimFieldListDecoder(SetDecoder):
)
def get_index(self, row):
find = tuple(p.get("key") for p in row[self.start:self.start + self.num_columns:])
return self.domain.getIndexByKey(find)
part = row[self.start:self.start + len(self.fields):]
if part[0]['doc_count']==0:
return None
find = tuple(p.get("key") for p in part)
output = self.domain.getIndexByKey(find)
return output
@property
def num_columns(self):
return len(self.fields)
pull_functions = {
STRING: lambda x: x,
NUMBER: lambda x: float(x) if x !=None else None,
BOOLEAN: string2boolean
}

33
vendor/jx_elasticsearch/es14/deep.py vendored

@ -11,7 +11,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from jx_base import STRUCT, NESTED, EXISTS
from jx_base import NESTED
from jx_base.expressions import NULL
from jx_base.query import DEFAULT_LIMIT
from jx_elasticsearch import post as es_post
@ -49,8 +49,7 @@ def is_deepop(es, query):
def es_deepop(es, query):
schema = query.frum.schema
columns = schema.columns
query_path = schema.query_path
query_path = schema.query_path[0]
# TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
# THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER: ES ALLOWS
@ -97,7 +96,7 @@ def es_deepop(es, query):
col_names = set()
for c in leaves:
if c.nested_path[0] == ".":
if c.type == NESTED:
if c.jx_type == NESTED:
continue
es_query.fields += [c.es_column]
c_name = untype_path(c.names[query_path])
@ -128,7 +127,7 @@ def es_deepop(es, query):
for n in net_columns:
pull = get_pull_function(n)
if n.nested_path[0] == ".":
if n.type == NESTED:
if n.jx_type == NESTED:
continue
es_query.fields += [n.es_column]
@ -155,14 +154,14 @@ def es_deepop(es, query):
else:
expr = s.value
for v in expr.vars():
for c in schema[v]:
for c in schema[v.var]:
if c.nested_path[0] == ".":
es_query.fields += [c.es_column]
# else:
# Log.error("deep field not expected")
pull_name = EXPRESSION_PREFIX + s.name
map_to_local = {untype_path(k): get_pull(cc) for k, c in schema.lookup.items() for cc in c if cc.type not in STRUCT}
map_to_local = MapToLocal(schema)
pull = jx_expression_to_function(pull_name)
post_expressions[pull_name] = compile_expression(expr.map(map_to_local).to_python())
@ -217,3 +216,23 @@ def es_deepop(es, query):
Log.error("problem formatting", e)
class MapToLocal(object):
"""
MAP FROM RELATIVE/ABSOLUTE NAMESPACE TO PYTHON THAT WILL EXTRACT RESULT
"""
def __init__(self, map_to_columns):
self.map_to_columns = map_to_columns
def __getitem__(self, item):
return self.get(item)
def get(self, item):
cs = self.map_to_columns[item]
if len(cs) == 0:
return "Null"
elif len(cs) == 1:
return get_pull(cs[0])
else:
return "coalesce(" + (",".join(get_pull(c) for c in cs)) + ")"

647
vendor/jx_elasticsearch/es14/expressions.py vendored

File diff not shown because of its large size.

24
vendor/jx_elasticsearch/es14/format.py vendored

@ -11,17 +11,15 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from collections import Mapping
from mo_dots import Data, set_default, wrap, split_field, coalesce
from mo_logs import Log
from pyLibrary import convert
from jx_base.expressions import TupleOp
from jx_elasticsearch.es14.aggs import count_dim, aggs_iterator, format_dispatch, drill
from jx_python.containers.cube import Cube
from mo_collections.matrix import Matrix
from mo_dots import Data, set_default, wrap, split_field, coalesce
from mo_future import sort_using_key
from mo_logs import Log
from mo_logs.strings import quote
from pyLibrary import convert
FunctionType = type(lambda: 1)
@ -51,7 +49,7 @@ def format_cube(decoders, aggs, start, query, select):
cube = Cube(
query.select,
sorted(new_edges, key=lambda e: e.dim), # ENSURE EDGES ARE IN SAME ORDER AS QUERY
sort_using_key(new_edges, key=lambda e: e.dim), # ENSURE EDGES ARE IN SAME ORDER AS QUERY
{s.name: m for s, m in matricies}
)
cube.frum = query
@ -184,7 +182,7 @@ def format_list_from_groupby(decoders, aggs, start, query, select):
continue
output = Data()
for g, d in zip(query.groupby, decoders):
output[g.put.name] = d.get_value_from_row(row)
output[coalesce(g.put.name, g.name)] = d.get_value_from_row(row)
for s in select:
output[s.name] = s.pull(agg)
@ -210,7 +208,7 @@ def format_list(decoders, aggs, start, query, select):
if query.sort and not query.groupby:
# TODO: USE THE format_table() TO PRODUCE THE NEEDED VALUES INSTEAD OF DUPLICATING LOGIC HERE
all_coord = is_sent._all_combos() # TRACK THE EXPECTED COMBINATIONS
for row, coord, agg in aggs_iterator(aggs, decoders):
for _, coord, agg in aggs_iterator(aggs, decoders):
missing_coord = all_coord.next()
while coord != missing_coord:
# INSERT THE MISSING COORDINATE INTO THE GENERATION
@ -232,7 +230,7 @@ def format_list(decoders, aggs, start, query, select):
output[s.name] = s.pull(agg)
yield output
else:
is_sent = Matrix(dims=dims, zeros=0)
for row, coord, agg in aggs_iterator(aggs, decoders):
is_sent[coord] = 1
@ -286,12 +284,6 @@ def format_list_from_aggop(decoders, aggs, start, query, select):
})
def format_line(decoders, aggs, start, query, select):
list = format_list(decoders, aggs, start, query, select)

27
vendor/jx_elasticsearch/es14/setop.py vendored
View file

@ -19,12 +19,11 @@ from jx_base.expressions import IDENTITY
from jx_base.query import DEFAULT_LIMIT
from jx_elasticsearch import post as es_post
from jx_elasticsearch.es14.expressions import Variable, LeavesOp
from jx_elasticsearch.es14.util import jx_sort_to_es_sort, es_query_template
from jx_elasticsearch.es14.util import jx_sort_to_es_sort, es_query_template, es_and, es_or, es_not, es_script
from jx_python.containers.cube import Cube
from jx_python.expressions import jx_expression_to_function
from mo_collections.matrix import Matrix
from mo_dots import coalesce, split_field, set_default, Data, unwraplist, literal_field, unwrap, wrap, concat_field, relative_field, join_field
from mo_dots import listwrap
from mo_dots import coalesce, split_field, set_default, Data, unwraplist, literal_field, unwrap, wrap, concat_field, relative_field, join_field, listwrap
from mo_dots.lists import FlatList
from mo_json.typed_encoder import untype_path, unnest_path, untyped
from mo_logs import Log
@ -56,7 +55,7 @@ def is_setop(es, query):
def es_setop(es, query):
schema = query.frum.schema
es_query, filters = es_query_template(schema.query_path)
es_query, filters = es_query_template(schema.query_path[0])
nested_filter = None
set_default(filters[0], query.where.partial_eval().to_esfilter(schema))
es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
@ -78,7 +77,7 @@ def es_setop(es, query):
leaves = schema.leaves(term.var)
for c in leaves:
full_name = concat_field(select.name, relative_field(untype_path(c.names["."]), term.var))
if c.type == NESTED:
if c.jx_type == NESTED:
es_query.fields = ["_source"]
new_select.append({
"name": full_name,
@ -88,7 +87,7 @@ def es_setop(es, query):
})
put_index += 1
elif c.nested_path[0] != ".":
es_query.fields = ["_source"]
pass # THE NESTED PARENT WILL CAPTURE THIS
else:
es_query.fields += [c.es_column]
new_select.append({
@ -103,7 +102,7 @@ def es_setop(es, query):
leaves = schema.leaves(s_column)
nested_selects = {}
if leaves:
if any(c.type == NESTED for c in leaves):
if s_column == '.' or any(c.jx_type == NESTED for c in leaves):
# PULL WHOLE NESTED ARRAYS
es_query.fields = ["_source"]
for c in leaves:
@ -120,7 +119,7 @@ def es_setop(es, query):
for c in leaves:
if len(c.nested_path) == 1:
jx_name = untype_path(c.names["."])
if c.type == NESTED:
if c.jx_type == NESTED:
es_query.fields = ["_source"]
new_select.append({
"name": select.name,
@ -144,7 +143,7 @@ def es_setop(es, query):
filters[0][k] = None
set_default(
filters[0],
{"and": [where, {"or": nested_filter}]}
es_and([where, es_or(nested_filter)])
)
nested_path = c.nested_path[0]
@ -156,7 +155,7 @@ def es_setop(es, query):
where.nested.inner_hits._source = False
where.nested.inner_hits.fields += [c.es_column]
child = relative_field(untype_path(c.names[schema.query_path]), s_column)
child = relative_field(untype_path(c.names[schema.query_path[0]]), s_column)
pull = accumulate_nested_doc(nested_path, Variable(relative_field(s_column, unnest_path(nested_path))))
new_select.append({
"name": select.name,
@ -169,7 +168,7 @@ def es_setop(es, query):
"pull": pull
})
else:
nested_selects[nested_path].nested.inner_hits.fields+=[c.es_column]
nested_selects[nested_path].nested.inner_hits.fields += [c.es_column]
else:
new_select.append({
"name": select.name,
@ -178,9 +177,8 @@ def es_setop(es, query):
})
put_index += 1
else:
painless = select.value.partial_eval().to_ruby(schema)
es_query.script_fields[literal_field(select.name)] = {"script": painless.script(schema)}
painless = select.value.partial_eval().to_es_script(schema)
es_query.script_fields[literal_field(select.name)] = es_script(painless.script(schema))
new_select.append({
"name": select.name,
"pull": jx_expression_to_function("fields." + literal_field(select.name)),
@ -345,6 +343,7 @@ set_default(format_dispatch, {
"list": (format_list, None, "application/json")
})
def get_pull(column):
if column.nested_path[0] == ".":
return concat_field("fields", literal_field(column.es_column))

34
vendor/jx_elasticsearch/es14/util.py vendored
View file

@ -11,6 +11,10 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from mo_future import text_type
from mo_logs import Log
from jx_base import STRING, BOOLEAN, NUMBER, OBJECT
from jx_elasticsearch.es14.expressions import Variable
from mo_dots import wrap
@ -23,18 +27,21 @@ def es_query_template(path):
:return:
"""
if not isinstance(path, text_type):
Log.error("expecting path to be a string")
if path != ".":
f0 = {}
f1 = {}
output = wrap({
"query": {"filtered": {"filter": {"and":[
"query": {"filtered": {"filter": es_and([
f0,
{"nested": {
"path": path,
"filter": f1,
"inner_hits": {"size": 100000}
}}
]}}},
])}},
"from": 0,
"size": 0,
"sort": []
@ -43,7 +50,7 @@ def es_query_template(path):
else:
f0 = {}
output = wrap({
"query": {"filtered": {"filter": f0}},
"query": {"filtered": {"filter": es_and([f0])}},
"from": 0,
"size": 0,
"sort": []
@ -66,7 +73,7 @@ def jx_sort_to_es_sort(sort, schema):
for type in types:
for c in cols:
if c.type == type:
if c.jx_type == type:
if s.sort == -1:
output.append({c.es_column: "desc"})
else:
@ -109,3 +116,22 @@ aggregates = {
NON_STATISTICAL_AGGS = {"none", "one"}
def es_and(terms):
return wrap({"and": terms})
def es_or(terms):
return wrap({"or": terms})
def es_not(term):
return wrap({"not": term})
def es_script(term):
return wrap({"script": term})
def es_missing(term):
return {"missing": {"field": term}}

36
vendor/jx_elasticsearch/es52/__init__.py vendored
View file

@ -19,12 +19,11 @@ from jx_base.dimensions import Dimension
from jx_base.expressions import jx_expression
from jx_base.queries import is_variable_name
from jx_base.query import QueryOp
from jx_base.schema import Schema
from jx_elasticsearch.es52.aggs import es_aggsop, is_aggsop
from jx_elasticsearch.es52.deep import is_deepop, es_deepop
from jx_elasticsearch.es52.setop import is_setop, es_setop
from jx_elasticsearch.es52.util import aggregates
from jx_elasticsearch.meta import FromESMetadata
from jx_elasticsearch.meta import ElasticsearchMetadata, Table
from jx_python import jx
from mo_dots import Data, Null, unwrap, coalesce, split_field, literal_field, unwraplist, join_field, wrap, listwrap, FlatList
from mo_json import scrub, value2json
@ -41,7 +40,7 @@ class ES52(Container):
def __new__(cls, *args, **kwargs):
if (len(args) == 1 and args[0].get("index") == "meta") or kwargs.get("index") == "meta":
output = FromESMetadata.__new__(FromESMetadata, *args, **kwargs)
output = ElasticsearchMetadata.__new__(ElasticsearchMetadata, *args, **kwargs)
output.__init__(*args, **kwargs)
return output
else:
@ -62,26 +61,25 @@ class ES52(Container):
typed=None,
kwargs=None
):
Container.__init__(self, None)
Container.__init__(self)
if not container.config.default:
container.config.default = {
"type": "elasticsearch",
"settings": unwrap(kwargs)
}
self.settings = kwargs
self.name = coalesce(name, alias, index)
self.name = name = coalesce(name, alias, index)
if read_only:
self.es = elasticsearch.Alias(alias=coalesce(alias, index), kwargs=kwargs)
else:
self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)
self.meta = FromESMetadata(kwargs=kwargs)
self._namespace = ElasticsearchMetadata(kwargs=kwargs)
self.settings.type = self.es.settings.type
self.edges = Data()
self.worker = None
columns = self.meta.get_columns(table_name=coalesce(name, alias, index))
self._schema = Schema(coalesce(name, alias, index), columns)
columns = self._namespace.get_snowflake(self._es.settings.alias).columns # ABSOLUTE COLUMNS
if typed == None:
# SWITCH ON TYPED MODE
@ -90,8 +88,19 @@ class ES52(Container):
self.typed = typed
@property
def schema(self):
return self._schema
def snowflake(self):
return self._namespace.get_snowflake(self._es.settings.alias)
@property
def namespace(self):
return self._namespace
def get_table(self, full_name):
return Table(full_name, self)
def get_schema(self, query_path):
return self._namespace.get_schema(query_path)
def __data__(self):
settings = self.settings.copy()
@ -122,10 +131,7 @@ class ES52(Container):
def query(self, _query):
try:
query = QueryOp.wrap(_query, table=self, schema=self.schema)
for n in self.namespaces:
query = n.convert(query)
query = QueryOp.wrap(_query, container=self, namespace=self.namespace)
for s in listwrap(query.select):
if s.aggregate != None and not aggregates.get(s.aggregate):
@ -209,7 +215,7 @@ class ES52(Container):
scripts.append({"doc": v.doc})
else:
v = scrub(v)
scripts.append({"script": "ctx._source." + k + " = " + jx_expression(v).to_painless(schema).script(schema)})
scripts.append({"script": "ctx._source." + k + " = " + jx_expression(v).to_es_script(schema).script(schema)})
if results.hits.hits:
updates = []

91
vendor/jx_elasticsearch/es52/aggs.py vendored
View file

@ -14,7 +14,7 @@ from __future__ import unicode_literals
from jx_base import EXISTS
from jx_base.domains import SetDomain
from jx_base.expressions import TupleOp, NULL
from jx_base.query import DEFAULT_LIMIT
from jx_base.query import DEFAULT_LIMIT, MAX_LIMIT
from jx_elasticsearch import post as es_post
from jx_elasticsearch.es52.decoders import DefaultDecoder, AggsDecoder, ObjectDecoder, DimFieldListDecoder
from jx_elasticsearch.es52.expressions import split_expression_by_depth, AndOp, Variable, NullOp
@ -30,7 +30,6 @@ from mo_logs.strings import quote, expand_template
from mo_math import Math, MAX, UNION
from mo_times.timer import Timer
COMPARE_TUPLE = """
(a, b)->{
int i=0;
@ -79,7 +78,6 @@ MAX_OF_TUPLE = """
def is_aggsop(es, query):
es.cluster.get_metadata()
if query.edges or query.groupby or any(a != None and a != "none" for a in listwrap(query.select).aggregate):
return True
return False
@ -106,12 +104,12 @@ def get_decoders_by_depth(query):
edge = edge.copy()
vars_ = edge.value.vars()
for v in vars_:
if not schema.leaves(v.var, meta=True):
if not schema.leaves(v.var):
Log.error("{{var}} does not exist in schema", var=v)
elif edge.range:
vars_ = edge.range.min.vars() | edge.range.max.vars()
for v in vars_:
if not schema[v]:
if not schema[v.var]:
Log.error("{{var}} does not exist in schema", var=v)
elif edge.domain.dimension:
vars_ = edge.domain.dimension.fields
@ -148,10 +146,8 @@ def sort_edges(query, prop):
ordered_edges = []
remaining_edges = getattr(query, prop)
for s in query.sort:
if not isinstance(s.value, Variable):
Log.error("can only sort by terms")
for e in remaining_edges:
if e.value.var == s.value.var:
if e.value == s.value:
if isinstance(e.domain, SetDomain):
pass # ALREADY SORTED?
else:
@ -159,6 +155,9 @@ def sort_edges(query, prop):
ordered_edges.append(e)
remaining_edges.remove(e)
break
else:
Log.error("Can not sort by {{expr}}, can only sort by an existing edge expression", expr=s.value)
ordered_edges.extend(remaining_edges)
return ordered_edges
@ -187,33 +186,33 @@ def es_aggsop(es, frum, query):
for canonical_name, many in new_select.items():
for s in many:
es_cols = frum.schema.values(s.value.var)
columns = frum.schema.values(s.value.var)
if s.aggregate == "count":
canonical_names = []
for es_col in es_cols:
cn = literal_field(es_col.es_column + "_count")
if es_col.type == EXISTS:
for column in columns:
cn = literal_field(column.es_column + "_count")
if column.jx_type == EXISTS:
canonical_names.append(cn + ".doc_count")
es_query.aggs[cn].filter.range = {es_col.es_column: {"gt": 0}}
es_query.aggs[cn].filter.range = {column.es_column: {"gt": 0}}
else:
canonical_names.append(cn+ ".value")
es_query.aggs[cn].value_count.field = es_col.es_column
if len(es_cols) == 1:
es_query.aggs[cn].value_count.field = column.es_column
if len(canonical_names) == 1:
s.pull = jx_expression_to_function(canonical_names[0])
else:
s.pull = jx_expression_to_function({"add": canonical_names})
elif s.aggregate == "median":
if len(es_cols) > 1:
if len(columns) > 1:
Log.error("Do not know how to count columns with more than one type (script probably)")
# ES USES DIFFERENT METHOD FOR PERCENTILES
key = literal_field(canonical_name + " percentile")
es_query.aggs[key].percentiles.field = es_cols[0].es_column
es_query.aggs[key].percentiles.field = columns[0].es_column
es_query.aggs[key].percentiles.percents += [50]
s.pull = jx_expression_to_function(key + ".values.50\\.0")
elif s.aggregate == "percentile":
if len(es_cols) > 1:
if len(columns) > 1:
Log.error("Do not know how to count columns with more than one type (script probably)")
# ES USES DIFFERENT METHOD FOR PERCENTILES
key = literal_field(canonical_name + " percentile")
@ -221,48 +220,48 @@ def es_aggsop(es, frum, query):
Log.error("Expecting percentile to be a float from 0.0 to 1.0")
percent = Math.round(s.percentile * 100, decimal=6)
es_query.aggs[key].percentiles.field = es_cols[0].es_column
es_query.aggs[key].percentiles.field = columns[0].es_column
es_query.aggs[key].percentiles.percents += [percent]
s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
elif s.aggregate == "cardinality":
canonical_names = []
for es_col in es_cols:
cn = literal_field(es_col.es_column + "_cardinality")
for column in columns:
cn = literal_field(column.es_column + "_cardinality")
canonical_names.append(cn)
es_query.aggs[cn].cardinality.field = es_col.es_column
if len(es_cols) == 1:
es_query.aggs[cn].cardinality.field = column.es_column
if len(columns) == 1:
s.pull = jx_expression_to_function(canonical_names[0] + ".value")
else:
s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0})
elif s.aggregate == "stats":
if len(es_cols) > 1:
if len(columns) > 1:
Log.error("Do not know how to count columns with more than one type (script probably)")
# REGULAR STATS
stats_name = literal_field(canonical_name)
es_query.aggs[stats_name].extended_stats.field = es_cols[0].es_column
es_query.aggs[stats_name].extended_stats.field = columns[0].es_column
# GET MEDIAN TOO!
median_name = literal_field(canonical_name + "_percentile")
es_query.aggs[median_name].percentiles.field = es_cols[0].es_column
es_query.aggs[median_name].percentiles.field = columns[0].es_column
es_query.aggs[median_name].percentiles.percents += [50]
s.pull = get_pull_stats(stats_name, median_name)
elif s.aggregate == "union":
pulls = []
for es_col in es_cols:
for column in columns:
script = {"scripted_metric": {
'init_script': 'params._agg.terms = new HashSet()',
'map_script': 'for (v in doc['+quote(es_col.es_column)+'].values) params._agg.terms.add(v)',
'map_script': 'for (v in doc['+quote(column.es_column)+'].values) params._agg.terms.add(v)',
'combine_script': 'return params._agg.terms.toArray()',
'reduce_script': 'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()',
}}
stats_name = encode_property(es_col.es_column)
if es_col.nested_path[0] == ".":
stats_name = encode_property(column.es_column)
if column.nested_path[0] == ".":
es_query.aggs[stats_name] = script
pulls.append(jx_expression_to_function(stats_name + ".value"))
else:
es_query.aggs[stats_name] = {
"nested": {"path": es_col.nested_path[0]},
"nested": {"path": column.nested_path[0]},
"aggs": {"_nested": script}
}
pulls.append(jx_expression_to_function(stats_name + "._nested.value"))
@ -274,11 +273,11 @@ def es_aggsop(es, frum, query):
else:
s.pull = lambda row: UNION(p(row) for p in pulls)
else:
if len(es_cols) > 1:
if len(columns) > 1:
Log.error("Do not know how to count columns with more than one type (script probably)")
# PULL VALUE OUT OF THE stats AGGREGATE
es_query.aggs[literal_field(canonical_name)].extended_stats.field = es_cols[0].es_column
es_query.aggs[literal_field(canonical_name)].extended_stats.field = columns[0].es_column
s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." + aggregates[s.aggregate], s.default]})
for i, s in enumerate(formula):
@ -296,8 +295,8 @@ def es_aggsop(es, frum, query):
dir = -1
op = 'min'
nully = TupleOp("tuple", [NULL]*len(s.value.terms)).partial_eval().to_painless(schema).expr
selfy = s.value.partial_eval().to_painless(schema).expr
nully = TupleOp("tuple", [NULL]*len(s.value.terms)).partial_eval().to_es_script(schema).expr
selfy = s.value.partial_eval().to_es_script(schema).expr
script = {"scripted_metric": {
'init_script': 'params._agg.best = ' + nully + ';',
@ -317,13 +316,13 @@ def es_aggsop(es, frum, query):
else:
Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate)
elif s.aggregate == "count":
es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_painless(schema).script(schema)
es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_es_script(schema).script(schema)
s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value")
elif s.aggregate == "median":
# ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
key = literal_field(canonical_name + " percentile")
es_query.aggs[key].percentiles.script = s.value.to_painless(schema).script(schema)
es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema)
es_query.aggs[key].percentiles.percents += [50]
s.pull = jx_expression_to_function(key + ".values.50\\.0")
elif s.aggregate == "percentile":
@ -331,35 +330,35 @@ def es_aggsop(es, frum, query):
key = literal_field(canonical_name + " percentile")
percent = Math.round(s.percentile * 100, decimal=6)
es_query.aggs[key].percentiles.script = s.value.to_painless(schema).script(schema)
es_query.aggs[key].percentiles.script = s.value.to_es_script(schema).script(schema)
es_query.aggs[key].percentiles.percents += [percent]
s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
elif s.aggregate == "cardinality":
# ES USES DIFFERENT METHOD FOR CARDINALITY
key = canonical_name + " cardinality"
es_query.aggs[key].cardinality.script = s.value.to_painless(schema).script(schema)
es_query.aggs[key].cardinality.script = s.value.to_es_script(schema).script(schema)
s.pull = jx_expression_to_function(key + ".value")
elif s.aggregate == "stats":
# REGULAR STATS
stats_name = literal_field(canonical_name)
es_query.aggs[stats_name].extended_stats.script = s.value.to_painless(schema).script(schema)
es_query.aggs[stats_name].extended_stats.script = s.value.to_es_script(schema).script(schema)
# GET MEDIAN TOO!
median_name = literal_field(canonical_name + " percentile")
es_query.aggs[median_name].percentiles.script = s.value.to_painless(schema).script(schema)
es_query.aggs[median_name].percentiles.script = s.value.to_es_script(schema).script(schema)
es_query.aggs[median_name].percentiles.percents += [50]
s.pull = get_pull_stats(stats_name, median_name)
elif s.aggregate == "union":
# USE TERMS AGGREGATE TO SIMULATE union
stats_name = literal_field(canonical_name)
es_query.aggs[stats_name].terms.script_field = s.value.to_painless(schema).script(schema)
es_query.aggs[stats_name].terms.script_field = s.value.to_es_script(schema).script(schema)
s.pull = jx_expression_to_function(stats_name + ".buckets.key")
else:
# PULL VALUE OUT OF THE stats AGGREGATE
s.pull = jx_expression_to_function(canonical_name + "." + aggregates[s.aggregate])
es_query.aggs[canonical_name].extended_stats.script = s.value.to_painless(schema).script(schema)
es_query.aggs[canonical_name].extended_stats.script = s.value.to_es_script(schema).script(schema)
decoders = get_decoders_by_depth(query)
start = 0
@ -384,11 +383,7 @@ def es_aggsop(es, frum, query):
es_query = wrap({
"aggs": {"_nested": set_default(
{
"nested": {
"path": schema.query_path
}
},
{"nested": {"path": schema.query_path[0]}},
es_query
)}
})

138
vendor/jx_elasticsearch/es52/decoders.py vendored
View file

@ -13,21 +13,21 @@ from __future__ import unicode_literals
from collections import Mapping
from mo_future import text_type, binary_type
from jx_base import STRING, NUMBER, BOOLEAN
from jx_base.dimensions import Dimension
from jx_base.domains import SimpleSetDomain, DefaultDomain, PARTITION
from jx_base.expressions import TupleOp, value2json
from jx_base.expressions import TupleOp, TRUE
from jx_base.query import MAX_LIMIT, DEFAULT_LIMIT
from jx_elasticsearch.es52.expressions import Variable, NotOp, InOp, Literal, OrOp, AndOp, InequalityOp, LeavesOp, LIST_TO_PIPE
from jx_elasticsearch.es52.expressions import Variable, NotOp, InOp, Literal, AndOp, InequalityOp, LeavesOp, LIST_TO_PIPE
from jx_elasticsearch.es52.util import es_missing
from jx_python import jx
from mo_dots import set_default, coalesce, literal_field, Data, relative_field, unwraplist
from mo_dots import wrap
from mo_dots import wrap, set_default, coalesce, literal_field, Data, relative_field, unwraplist
from mo_future import text_type
from mo_json.typed_encoder import untype_path
from mo_logs import Log
from mo_logs.strings import quote, expand_template
from mo_math import MAX, MIN
from mo_math import Math
from mo_math import MAX, MIN, Math
from pyLibrary.convert import string2boolean
class AggsDecoder(object):
@ -144,6 +144,7 @@ class SetDecoder(AggsDecoder):
AggsDecoder.__init__(self, edge, query, limit)
domain = self.domain = edge.domain
self.sorted = None
self.pull = pull_functions[STRING]
# WE ASSUME IF THE VARIABLES MATCH, THEN THE SORT TERM AND EDGE TERM MATCH, AND WE SORT BY TERM
# self.sorted = {1: "asc", -1: "desc", None: None}[getattr(edge.domain, 'sort', None)]
@ -186,7 +187,7 @@ class SetDecoder(AggsDecoder):
terms = set_default({"terms": {
"script": {
"lang": "painless",
"inline": value.to_painless(self.schema).script(self.schema)
"inline": value.to_es_script(self.schema).script(self.schema)
},
"size": limit
}}, es_query)
@ -213,7 +214,7 @@ class SetDecoder(AggsDecoder):
return self.domain.getKeyByIndex(index)
def get_value_from_row(self, row):
return row[self.start].get('key')
return self.pull(row[self.start].get('key'))
def get_index(self, row):
try:
@ -249,7 +250,7 @@ def _range_composer(edge, domain, es_query, to_float, schema):
if isinstance(edge.value, Variable):
calc = {"field": schema.leaves(edge.value.var)[0].es_column}
else:
calc = {"script": edge.value.to_painless(schema).script(schema)}
calc = {"script": edge.value.to_es_script(schema).script(schema)}
return wrap({"aggs": {
"_match": set_default(
@ -464,7 +465,7 @@ class MultivalueDecoder(SetDecoder):
self.start = start
es_field = self.query.frum.schema.leaves(self.var)[0].es_column
es_query = wrap({"aggs": {
es_query = wrap({"aggs": {
"_match": set_default({"terms": {
"script": expand_template(LIST_TO_PIPE, {"expr": 'doc[' + quote(es_field) + '].values'})
}}, es_query)
@ -521,7 +522,7 @@ class ObjectDecoder(AggsDecoder):
"size": self.domain.limit
}}, es_query),
"_missing": set_default(
{"filter": {"bool": {"must_not": {"exists": {"field": v}}}}},
{"filter": es_missing(v)},
es_query
)
}})
@ -580,73 +581,67 @@ class DefaultDecoder(SetDecoder):
self.parts = list()
self.key2index = {}
self.computed_domain = False
self.script = self.edge.value.partial_eval().to_es_script(self.schema)
self.pull = pull_functions[self.script.data_type]
self.missing = self.script.miss.partial_eval()
self.exists = NotOp("not", self.missing).partial_eval()
# WE ASSUME IF THE VARIABLES MATCH, THEN THE SORT TERM AND EDGE TERM MATCH, AND WE SORT BY TERM
self.sorted = None
edge_var = edge.value.vars()
for s in query.sort:
if not edge_var - s.value.vars():
self.sorted = {1: "asc", -1: "desc"}[s.sort]
# WHEN SORT VALUE AND EDGE VALUE MATCHES, WE SORT BY TERM
sort_candidates = [s for s in self.query.sort if s.value == self.edge.value]
if sort_candidates:
self.es_order = {"_term": {1: "asc", -1: "desc"}[sort_candidates[0].sort]}
else:
self.es_order = None
def append_query(self, es_query, start):
self.start = start
value = self.edge.value.partial_eval()
script = value.to_painless(self.schema)
exists = NotOp("not", script.miss).partial_eval()
if not isinstance(self.edge.value, Variable):
output = wrap({"aggs": {
"_match": {
"filter": exists.to_esfilter(self.schema),
"aggs": {
"_filter": set_default(
{"terms": {
"script": {
"lang": "painless",
"inline": script.expr
},
"size": self.domain.limit,
"order": {"_term": self.sorted} if self.sorted else None
}},
es_query
)
}
},
"_missing": set_default(
{"filter": NotOp("not", exists).to_esfilter(self.schema)},
es_query
)
}})
return output
elif self.edge.value.var in [s.value.var for s in self.query.sort]:
sort_dir = [s.sort for s in self.query.sort if s.value.var == self.edge.value.var][0]
output = wrap({"aggs": {
"_match": set_default(
{"terms": {
"field": self.schema.leaves(self.edge.value.var)[0].es_column,
"size": self.domain.limit,
"order": {"_term": "asc" if sort_dir == 1 else "desc"}
}},
es_query
),
"_missing": set_default(
{"filter": NotOp("not", exists).to_esfilter(self.schema)},
es_query
)
}})
if self.exists is TRUE:
# IF True THEN WE DO NOT NEED THE _filter OR THE _missing (THIS RARELY HAPPENS THOUGH)
output = wrap({"aggs": {
"_match": set_default(
{"terms": {
"script": {"lang": "painless", "inline": self.script.expr},
"size": self.domain.limit,
"order": self.es_order
}},
es_query
)
}})
else:
output = wrap({"aggs": {
"_match": { # _match AND _filter REVERSED SO _match LINES UP WITH _missing
"filter": self.exists.to_esfilter(self.schema),
"aggs": {
"_filter": set_default(
{"terms": {
"script": {"lang": "painless", "inline": self.script.expr},
"size": self.domain.limit,
"order": self.es_order
}},
es_query
)
}
},
"_missing": set_default(
{"filter": self.missing.to_esfilter(self.schema)},
es_query
)
}})
return output
else:
output = wrap({"aggs": {
"_match": set_default(
{"terms": {
"field": self.schema.leaves(self.edge.value.var)[0].es_column,
"size": self.domain.limit
"size": self.domain.limit,
"order": self.es_order
}},
es_query
),
"_missing": set_default(
{"filter": NotOp("not", exists).to_esfilter(self.schema)},
{"filter": self.missing.to_esfilter(self.schema)},
es_query
)
}})
@ -656,7 +651,7 @@ class DefaultDecoder(SetDecoder):
part = row[self.start]
if part['doc_count']:
if part.get('key') != None:
self.parts.append(part.get('key'))
self.parts.append(self.pull(part.get('key')))
else:
self.edge.allowNulls = True # OK! WE WILL ALLOW NULLS
@ -671,19 +666,19 @@ class DefaultDecoder(SetDecoder):
if self.computed_domain:
try:
part = row[self.start]
return self.domain.getIndexByKey(part.get('key'))
return self.domain.getIndexByKey(self.pull(part.get('key')))
except Exception as e:
Log.error("problem", cause=e)
else:
try:
part = row[self.start]
key = part.get('key')
key = self.pull(part.get('key'))
i = self.key2index.get(key)
if i is None:
i = len(self.parts)
part = {"key": key, "dataIndex": i}
self.parts.append({"key": key, "dataIndex": i})
self.key2index[i] = part
self.parts.append(part)
self.key2index[key] = i
return i
except Exception as e:
Log.error("problem", cause=e)
@ -755,3 +750,8 @@ class DimFieldListDecoder(SetDecoder):
return len(self.fields)
pull_functions = {
STRING: lambda x: x,
NUMBER: lambda x: float(x) if x !=None else None,
BOOLEAN: string2boolean
}
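The new pull_functions table exists because a terms aggregation driven by a painless script hands its bucket keys back as strings, so a NUMBER or BOOLEAN edge must convert each key before it is used for domain lookup. A hedged, self-contained sketch; the constants and string2boolean below are illustrative stand-ins for the jx_base constants and pyLibrary.convert.string2boolean.

STRING, NUMBER, BOOLEAN = "string", "number", "boolean"  # illustrative values only

def string2boolean(value):
    # stand-in for pyLibrary.convert.string2boolean
    return {"true": True, "T": True, "false": False, "F": False}.get(value)

pull_functions = {
    STRING: lambda x: x,
    NUMBER: lambda x: float(x) if x != None else None,
    BOOLEAN: string2boolean,
}

assert pull_functions[NUMBER]("42") == 42.0   # bucket key "42" becomes a float
assert pull_functions[NUMBER](None) is None   # missing bucket stays missing
assert pull_functions[BOOLEAN]("true") is True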

35
vendor/jx_elasticsearch/es52/deep.py vendored
View file

@ -11,7 +11,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from jx_base import STRUCT, NESTED
from jx_base import NESTED
from jx_base.expressions import NULL
from jx_base.query import DEFAULT_LIMIT
from jx_elasticsearch import post as es_post
@ -49,8 +49,7 @@ def is_deepop(es, query):
def es_deepop(es, query):
schema = query.frum.schema
columns = schema.columns
query_path = schema.query_path
query_path = schema.query_path[0]
# TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
# THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER: ES ALLOWS
@ -68,7 +67,7 @@ def es_deepop(es, query):
if not wheres[1]:
more_filter = {
"bool": {
"must": [AndOp("and", wheres[0]).partial_eval().to_esfilter(schema)],
"filter": [AndOp("and", wheres[0]).partial_eval().to_esfilter(schema)],
"must_not": {
"nested": {
"path": query_path,
@ -103,7 +102,7 @@ def es_deepop(es, query):
col_names = set()
for c in leaves:
if c.nested_path[0] == ".":
if c.type == NESTED:
if c.jx_type == NESTED:
continue
es_query.stored_fields += [c.es_column]
c_name = untype_path(c.names[query_path])
@ -134,7 +133,7 @@ def es_deepop(es, query):
for n in net_columns:
pull = get_pull_function(n)
if n.nested_path[0] == ".":
if n.type == NESTED:
if n.jx_type == NESTED:
continue
es_query.stored_fields += [n.es_column]
@ -161,14 +160,14 @@ def es_deepop(es, query):
else:
expr = s.value
for v in expr.vars():
for c in schema[v]:
for c in schema[v.var]:
if c.nested_path[0] == ".":
es_query.stored_fields += [c.es_column]
# else:
# Log.error("deep field not expected")
pull_name = EXPRESSION_PREFIX + s.name
map_to_local = {untype_path(k): get_pull(cc) for k, c in schema.lookup.items() for cc in c if cc.type not in STRUCT}
map_to_local = MapToLocal(schema)
pull = jx_expression_to_function(pull_name)
post_expressions[pull_name] = compile_expression(expr.map(map_to_local).to_python())
@ -223,3 +222,23 @@ def es_deepop(es, query):
Log.error("problem formatting", e)
class MapToLocal(object):
"""
MAP FROM RELATIVE/ABSOLUTE NAMESPACE TO PYTHON THAT WILL EXTRACT RESULT
"""
def __init__(self, map_to_columns):
self.map_to_columns = map_to_columns
def __getitem__(self, item):
return self.get(item)
def get(self, item):
cs = self.map_to_columns[item]
if len(cs) == 0:
return "Null"
elif len(cs) == 1:
return get_pull(cs[0])
else:
return "coalesce(" + (",".join(get_pull(c) for c in cs)) + ")"

558
vendor/jx_elasticsearch/es52/expressions.py vendored

Diff not shown due to its large size.

18
vendor/jx_elasticsearch/es52/format.py vendored
View file

@ -11,18 +11,15 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from collections import Mapping
from mo_dots import Data, set_default, wrap, split_field, coalesce
from mo_future import sort_using_key
from mo_logs import Log
from pyLibrary import convert
from jx_base.expressions import TupleOp
from jx_elasticsearch.es52.aggs import count_dim, aggs_iterator, format_dispatch, drill
from jx_python.containers.cube import Cube
from mo_collections.matrix import Matrix
from mo_dots import Data, set_default, wrap, split_field, coalesce
from mo_future import sort_using_key
from mo_logs import Log
from mo_logs.strings import quote
from pyLibrary import convert
FunctionType = type(lambda: 1)
@ -191,6 +188,9 @@ def format_list_from_groupby(decoders, aggs, start, query, select):
output[s.name] = s.pull(agg)
yield output
for g in query.groupby:
g.put.name = coalesce(g.put.name, g.name)
output = Data(
meta={"format": "list"},
data=list(data())
@ -208,7 +208,7 @@ def format_list(decoders, aggs, start, query, select):
if query.sort and not query.groupby:
# TODO: USE THE format_table() TO PRODUCE THE NEEDED VALUES INSTEAD OF DUPLICATING LOGIC HERE
all_coord = is_sent._all_combos() # TRACK THE EXPECTED COMBINATIONS
for row, coord, agg in aggs_iterator(aggs, decoders):
for _, coord, agg in aggs_iterator(aggs, decoders):
missing_coord = all_coord.next()
while coord != missing_coord:
# INSERT THE MISSING COORDINATE INTO THE GENERATION
@ -230,7 +230,7 @@ def format_list(decoders, aggs, start, query, select):
output[s.name] = s.pull(agg)
yield output
else:
is_sent = Matrix(dims=dims, zeros=0)
for row, coord, agg in aggs_iterator(aggs, decoders):
is_sent[coord] = 1

31
vendor/jx_elasticsearch/es52/setop.py vendored
View file

@ -19,12 +19,11 @@ from jx_base.expressions import IDENTITY
from jx_base.query import DEFAULT_LIMIT
from jx_elasticsearch import post as es_post
from jx_elasticsearch.es52.expressions import Variable, LeavesOp
from jx_elasticsearch.es52.util import jx_sort_to_es_sort, es_query_template
from jx_elasticsearch.es52.util import jx_sort_to_es_sort, es_query_template, es_and, es_or, es_not, es_script
from jx_python.containers.cube import Cube
from jx_python.expressions import jx_expression_to_function
from mo_collections.matrix import Matrix
from mo_dots import coalesce, split_field, set_default, Data, unwraplist, literal_field, unwrap, wrap, concat_field, relative_field, join_field
from mo_dots import listwrap
from mo_dots import coalesce, split_field, set_default, Data, unwraplist, literal_field, unwrap, wrap, concat_field, relative_field, join_field, listwrap
from mo_dots.lists import FlatList
from mo_json.typed_encoder import untype_path, unnest_path, untyped
from mo_logs import Log
@ -56,7 +55,7 @@ def is_setop(es, query):
def es_setop(es, query):
schema = query.frum.schema
es_query, filters = es_query_template(schema.query_path)
es_query, filters = es_query_template(schema.query_path[0])
nested_filter = None
set_default(filters[0], query.where.partial_eval().to_esfilter(schema))
es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
@ -75,10 +74,10 @@ def es_setop(es, query):
# IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
if isinstance(select.value, LeavesOp) and isinstance(select.value.term, Variable):
term = select.value.term
leaves = schema.values(term.var)
leaves = schema.leaves(term.var)
for c in leaves:
full_name = concat_field(select.name, relative_field(untype_path(c.names["."]), term.var))
if c.type == NESTED:
if c.jx_type == NESTED:
es_query.stored_fields = ["_source"]
new_select.append({
"name": full_name,
@ -88,7 +87,7 @@ def es_setop(es, query):
})
put_index += 1
elif c.nested_path[0] != ".":
es_query.stored_fields = ["_source"]
pass # THE NESTED PARENT WILL CAPTURE THIS
else:
es_query.stored_fields += [c.es_column]
new_select.append({
@ -103,7 +102,7 @@ def es_setop(es, query):
leaves = schema.leaves(s_column)
nested_selects = {}
if leaves:
if any(c.type == NESTED for c in leaves):
if s_column == '.' or any(c.jx_type == NESTED for c in leaves):
# PULL WHOLE NESTED ARRAYS
es_query.stored_fields = ["_source"]
for c in leaves:
@ -120,7 +119,7 @@ def es_setop(es, query):
for c in leaves:
if len(c.nested_path) == 1:
jx_name = untype_path(c.names["."])
if c.type == NESTED:
if c.jx_type == NESTED:
es_query.stored_fields = ["_source"]
new_select.append({
"name": select.name,
@ -144,7 +143,7 @@ def es_setop(es, query):
filters[0][k] = None
set_default(
filters[0],
{"bool": {"must": [where, {"bool": {"should": nested_filter}}]}}
es_and([where, es_or(nested_filter)])
)
nested_path = c.nested_path[0]
@ -156,7 +155,7 @@ def es_setop(es, query):
where.nested.inner_hits._source = False
where.nested.inner_hits.stored_fields += [c.es_column]
child = relative_field(untype_path(c.names[schema.query_path]), s_column)
child = relative_field(untype_path(c.names[schema.query_path[0]]), s_column)
pull = accumulate_nested_doc(nested_path, Variable(relative_field(s_column, unnest_path(nested_path))))
new_select.append({
"name": select.name,
@ -169,7 +168,7 @@ def es_setop(es, query):
"pull": pull
})
else:
nested_selects[nested_path].nested.inner_hits.stored_fields+=[c.es_column]
nested_selects[nested_path].nested.inner_hits.stored_fields += [c.es_column]
else:
new_select.append({
"name": select.name,
@ -178,11 +177,8 @@ def es_setop(es, query):
})
put_index += 1
else:
painless = select.value.partial_eval().to_painless(schema)
es_query.script_fields[literal_field(select.name)] = {"script": {
"lang": "painless",
"inline": painless.script(schema)
}}
painless = select.value.partial_eval().to_es_script(schema)
es_query.script_fields[literal_field(select.name)] = es_script(painless.script(schema))
new_select.append({
"name": select.name,
"pull": jx_expression_to_function("fields." + literal_field(select.name)),
@ -346,6 +342,7 @@ set_default(format_dispatch, {
"list": (format_list, None, "application/json")
})
def get_pull(column):
if column.nested_path[0] == ".":
return concat_field("fields", literal_field(column.es_column))

34
vendor/jx_elasticsearch/es52/util.py vendored
View file

@ -11,6 +11,10 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from mo_future import text_type
from mo_logs import Log
from jx_base import STRING, BOOLEAN, NUMBER, OBJECT
from jx_elasticsearch.es52.expressions import Variable
from mo_dots import wrap
@ -23,18 +27,21 @@ def es_query_template(path):
:return:
"""
if not isinstance(path, text_type):
Log.error("expecting path to be a string")
if path != ".":
f0 = {}
f1 = {}
output = wrap({
"query": {"bool": {"must": [
"query": es_and([
f0,
{"nested": {
"path": path,
"query": f1,
"inner_hits": {"size": 100000}
}}
]}},
]),
"from": 0,
"size": 0,
"sort": []
@ -43,7 +50,7 @@ def es_query_template(path):
else:
f0 = {}
output = wrap({
"query": {"bool": {"must": [f0]}},
"query": es_and([f0]),
"from": 0,
"size": 0,
"sort": []
@ -66,7 +73,7 @@ def jx_sort_to_es_sort(sort, schema):
for type in types:
for c in cols:
if c.type == type:
if c.jx_type == type:
if s.sort == -1:
output.append({c.es_column: "desc"})
else:
@ -109,3 +116,22 @@ aggregates = {
NON_STATISTICAL_AGGS = {"none", "one"}
def es_and(terms):
return wrap({"bool": {"filter": terms}})
def es_or(terms):
return wrap({"bool": {"should": terms}})
def es_not(term):
return wrap({"bool": {"must_not": term}})
def es_script(term):
return wrap({"script": {"lang": "painless", "inline": term}})
def es_missing(term):
return {"bool": {"must_not": {"exists": {"field": term}}}}

697
vendor/jx_elasticsearch/meta.py vendored
View file

@ -12,24 +12,28 @@ from __future__ import division
from __future__ import unicode_literals
import itertools
from copy import copy
from itertools import product
from jx_base import STRUCT, Table
import jx_base
from jx_base.namespace import Namespace
from mo_math import MAX
from mo_collections.relation import Relation_usingList
from jx_base import STRUCT, TableDesc, BOOLEAN
from jx_base.query import QueryOp
from jx_base.schema import Schema
from jx_python import jx, meta as jx_base_meta
from jx_python.containers.list_usingPythonList import ListContainer
from jx_python.meta import ColumnList, Column
from mo_dots import Data, relative_field, concat_field, SELF_PATH, ROOT_PATH, coalesce, set_default, Null, split_field, join_field, wrap
from mo_json.typed_encoder import EXISTS_TYPE
from mo_dots import Data, relative_field, SELF_PATH, ROOT_PATH, coalesce, set_default, Null, split_field, join_field, wrap, concat_field, startswith_field, literal_field
from mo_json.typed_encoder import EXISTS_TYPE, TYPE_PREFIX, untype_path, unnest_path
from mo_kwargs import override
from mo_logs import Log
from mo_logs.strings import quote
from mo_threads import Queue, THREAD_STOP, Thread, Till
from mo_times import HOUR, MINUTE, Timer, Date
from pyLibrary.env import elasticsearch
from pyLibrary.env.elasticsearch import es_type_to_json_type
from pyLibrary.env.elasticsearch import es_type_to_json_type, _get_best_type_from_mapping
MAX_COLUMN_METADATA_AGE = 12 * HOUR
ENABLE_META_SCAN = False
@ -39,9 +43,9 @@ OLD_METADATA = MINUTE
TEST_TABLE_PREFIX = "testing" # USED TO TURN OFF COMPLAINING ABOUT TEST INDEXES
class FromESMetadata(Schema):
class ElasticsearchMetadata(Namespace):
"""
QUERY THE METADATA
MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER
"""
def __new__(cls, *args, **kwargs):
@ -59,21 +63,31 @@ class FromESMetadata(Schema):
self.too_old = TOO_OLD
self.settings = kwargs
self.default_name = coalesce(name, alias, index)
self.default_es = elasticsearch.Cluster(kwargs=kwargs)
self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
self.index_does_not_exist = set()
self.todo = Queue("refresh metadata", max=100000, unique=True)
self.index_to_alias = Relation_usingList()
self.es_metadata = Null
self.abs_columns = set()
# self.abs_columns = set()
self.last_es_metadata = Date.now()-OLD_METADATA
self.meta = Data()
table_columns = metadata_tables()
column_columns = metadata_columns()
self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns}))
self.meta.columns = ColumnList()
self.meta.columns.insert(column_columns)
self.meta.columns.insert(table_columns)
self.alias_to_query_paths = {
"meta.columns": [['.']],
"meta.tables": [['.']]
}
self.alias_new_since = {
"meta.columns": Date.now(),
"meta.tables": Date.now()
}
table_columns = metadata_tables()
self.meta.tables = ListContainer("meta.tables", [], jx_base.Schema(".", table_columns))
self.meta.columns.extend(table_columns)
# TODO: fix monitor so it does not bring down ES
if ENABLE_META_SCAN:
self.worker = Thread.run("refresh metadata", self.monitor)
@ -81,79 +95,52 @@ class FromESMetadata(Schema):
self.worker = Thread.run("refresh metadata", self.not_monitor)
return
@property
def query_path(self):
return None
@property
def url(self):
return self.default_es.path + "/" + self.default_name.replace(".", "/")
return self.es_cluster.path + "/" + self.default_name.replace(".", "/")
def get_table(self, table_name):
with self.meta.tables.locker:
return wrap([t for t in self.meta.tables.data if t.name == table_name])
def _reload_columns(self, alias=None):
"""
:param alias: A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS)
:return:
"""
# FIND ALL INDEXES OF ALIAS
canonical_index = self.es_cluster.get_best_matching_index(alias).index
times = self.es_cluster.index_new_since
def _upsert_column(self, c):
# ASSUMING THE self.meta.columns.locker IS HAD
existing_columns = self.meta.columns.find(c.es_index, c.names["."])
for canonical in existing_columns:
if canonical.type == c.type and canonical is not c:
set_default(c.names, canonical.names)
for key in Column.__slots__:
canonical[key] = c[key]
if DEBUG:
Log.note("todo: {{table}}::{{column}}", table=canonical.es_index, column=canonical.es_column)
self.todo.add(canonical)
break
else:
self.meta.columns.add(c)
self.todo.add(c)
indexes = self.index_to_alias.get_domain(alias)
update_required = not (MAX(times[i] for i in indexes) < self.es_cluster.last_metadata)
metadata = self.es_cluster.get_metadata(force=update_required)
if ENABLE_META_SCAN:
if DEBUG:
Log.note("todo: {{table}}::{{column}}", table=c.es_index, column=c.es_column)
# MARK meta.columns AS DIRTY TOO
cols = self.meta.columns.find("meta.columns", None)
for cc in cols:
cc.partitions = cc.cardinality = None
cc.last_updated = Date.now() - TOO_OLD
self.todo.extend(cols)
props = [
# (index, type, properties) TRIPLE
(self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties)
for i, d in metadata.indices.items()
if i in indexes
for t, m in [_get_best_type_from_mapping(d.mappings)]
]
def _get_columns(self, table=None):
# TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
table_path = split_field(table)
es_index = table_path[0]
meta = self.es_metadata.indices[es_index]
if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
self.es_metadata = self.default_es.get_metadata(force=True)
meta = self.es_metadata.indices[es_index]
# CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
dirty = False
all_comparisions = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props)))
# NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
for (i1, t1, p1), (i2, t2, p2) in all_comparisions:
diff = elasticsearch.diff_schema(p2, p1)
if not self.settings.read_only:
for d in diff:
dirty = True
i1.add_property(*d)
meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index]
for data_type, properties in meta.mappings.items():
if data_type == "_default_":
continue
properties.properties["_id"] = {"type": "string", "index": "not_analyzed"}
self._parse_properties(meta.index, properties, meta)
data_type, mapping = _get_best_type_from_mapping(meta.mappings)
mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
self._parse_properties(alias, mapping, meta)
def _parse_properties(self, abs_index, properties, meta):
# IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND
# ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES
def add_column(c, query_path):
c.last_updated = Date.now() - TOO_OLD
if query_path[0] != ".":
c.names[query_path[0]] = relative_field(c.names["."], query_path[0])
with self.meta.columns.locker:
for alias in meta.aliases:
c_ = copy(c)
c_.es_index = alias
self._upsert_column(c_)
self._upsert_column(c)
abs_columns = elasticsearch.parse_properties(abs_index, None, properties.properties)
self.abs_columns.update(abs_columns)
def _parse_properties(self, alias, mapping, meta):
abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties)
with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
# LIST OF EVERY NESTED PATH
query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"]
query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
for a, b in itertools.product(query_paths, query_paths):
aa = a[0]
bb = b[0]
@ -166,15 +153,17 @@ class FromESMetadata(Schema):
b.insert(i, aa)
break
for q in query_paths:
q.append(".")
query_paths.append(SELF_PATH)
q.append(SELF_PATH)
query_paths.append(ROOT_PATH)
self.alias_to_query_paths[alias] = query_paths
# ADD RELATIVE COLUMNS
# ADD RELATIVE NAMES
for abs_column in abs_columns:
abs_column = abs_column.__copy__()
abs_column.type = es_type_to_json_type[abs_column.type]
abs_column.last_updated = None
abs_column.jx_type = es_type_to_json_type[abs_column.es_type]
for query_path in query_paths:
add_column(abs_column, query_path)
abs_column.names[query_path[0]] = relative_field(abs_column.names["."], query_path[0])
self.todo.add(self.meta.columns.add(abs_column))
pass
def query(self, _query):
@ -191,43 +180,62 @@ class FromESMetadata(Schema):
RETURN METADATA COLUMNS
"""
table_path = split_field(table_name)
es_index_name = table_path[0]
query_path = join_field(table_path[1:])
table = self.get_table(es_index_name)[0]
abs_column_name = None if column_name == None else concat_field(query_path, column_name)
root_table_name = table_path[0]
# FIND ALIAS
if root_table_name in self.alias_new_since:
alias = root_table_name
else:
alias = self.index_to_alias[root_table_name]
if not alias:
self.es_cluster.get_metadata(force=True)
# ENSURE INDEX -> ALIAS IS IN A MAPPING FOR LATER
for a in self.es_cluster.get_aliases():
self.alias_new_since[a.alias] = MAX([self.es_cluster.index_new_since[a.index], self.alias_new_since.get(a.alias)])
self.index_to_alias[a.index] = coalesce(a.alias, a.index)
if root_table_name in self.alias_new_since:
alias = root_table_name
else:
alias = self.index_to_alias[root_table_name]
if not alias:
Log.error("{{table|quote}} does not exist", table=table_name)
now = Date.now()
table = self.get_table(alias)[0]
try:
# LAST TIME WE GOT INFO FOR THIS TABLE
if not table:
table = Table(
name=es_index_name,
table = TableDesc(
name=alias,
url=None,
query_path=['.'],
timestamp=Date.now()
)
with self.meta.tables.locker:
self.meta.tables.add(table)
self._get_columns(table=es_index_name)
elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
table.timestamp = Date.now()
self._get_columns(table=es_index_name)
self._reload_columns(alias=alias)
elif force or table.timestamp < now - MAX_COLUMN_METADATA_AGE:
table.timestamp = now
self._reload_columns(alias=alias)
with self.meta.columns.locker:
columns = self.meta.columns.find(es_index_name, column_name)
if columns:
columns = jx.sort(columns, "names.\\.")
# AT LEAST WAIT FOR THE COLUMNS TO UPDATE
while len(self.todo) and not all(columns.get("last_updated")):
if DEBUG:
columns = self.meta.columns.find(alias, column_name)
columns = jx.sort(columns, "names.\.")
# AT LEAST WAIT FOR THE COLUMNS TO UPDATE
while len(self.todo) and not all(columns.get("last_updated")):
if DEBUG:
if len(columns) > 10:
Log.note("waiting for {{num}} columns to update", num=len([c for c in columns if not c.last_updated]))
else:
Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
Till(seconds=1).wait()
return columns
Till(seconds=1).wait()
return columns
except Exception as e:
Log.error("Not expected", cause=e)
if abs_column_name:
Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name)
return []
def _update_cardinality(self, column):
@ -237,44 +245,42 @@ class FromESMetadata(Schema):
if column.es_index in self.index_does_not_exist:
return
if column.type in STRUCT:
if column.jx_type in STRUCT:
Log.error("not supported")
try:
if column.es_index == "meta.columns":
with self.meta.columns.locker:
partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
self.meta.columns.update({
"set": {
"partitions": partitions,
"count": len(self.meta.columns),
"cardinality": len(partitions),
"multi": 1,
"last_updated": Date.now()
},
"where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
})
partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
self.meta.columns.update({
"set": {
"partitions": partitions,
"count": len(self.meta.columns),
"cardinality": len(partitions),
"multi": 1,
"last_updated": Date.now()
},
"where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
})
return
if column.es_index == "meta.tables":
with self.meta.columns.locker:
partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
self.meta.columns.update({
"set": {
"partitions": partitions,
"count": len(self.meta.tables),
"cardinality": len(partitions),
"multi": 1,
"last_updated": Date.now()
},
"where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
})
partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
self.meta.columns.update({
"set": {
"partitions": partitions,
"count": len(self.meta.tables),
"cardinality": len(partitions),
"multi": 1,
"last_updated": Date.now()
},
"where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
})
return
es_index = column.es_index.split(".")[0]
is_text = [cc for cc in self.abs_columns if cc.es_column == column.es_column and cc.type == "text"]
is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
if is_text:
# text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
result = self.default_es.post("/" + es_index + "/_search", data={
result = self.es_cluster.post("/" + es_index + "/_search", data={
"aggs": {
"count": {"filter": {"match_all": {}}}
},
@ -284,14 +290,24 @@ class FromESMetadata(Schema):
cardinality = 1001
multi = 1001
elif column.es_column == "_id":
result = self.default_es.post("/" + es_index + "/_search", data={
result = self.es_cluster.post("/" + es_index + "/_search", data={
"query": {"match_all": {}},
"size": 0
})
count = cardinality = result.hits.total
multi = 1
elif column.es_type == BOOLEAN:
result = self.es_cluster.post("/" + es_index + "/_search", data={
"aggs": {
"count": _counting_query(column)
},
"size": 0
})
count = result.hits.total
cardinality = 2
multi = 1
else:
result = self.default_es.post("/" + es_index + "/_search", data={
result = self.es_cluster.post("/" + es_index + "/_search", data={
"aggs": {
"count": _counting_query(column),
"multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
@ -308,47 +324,44 @@ class FromESMetadata(Schema):
query = Data(size=0)
if column.es_column == "_id":
with self.meta.columns.locker:
self.meta.columns.update({
"set": {
"count": cardinality,
"cardinality": cardinality,
"multi": 1,
"last_updated": Date.now()
},
"clear": ["partitions"],
"where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
})
self.meta.columns.update({
"set": {
"count": cardinality,
"cardinality": cardinality,
"multi": 1,
"last_updated": Date.now()
},
"clear": ["partitions"],
"where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
})
return
elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
if DEBUG:
Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
with self.meta.columns.locker:
self.meta.columns.update({
"set": {
"count": count,
"cardinality": cardinality,
"multi": multi,
"last_updated": Date.now()
},
"clear": ["partitions"],
"where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
})
self.meta.columns.update({
"set": {
"count": count,
"cardinality": cardinality,
"multi": multi,
"last_updated": Date.now()
},
"clear": ["partitions"],
"where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
})
return
elif column.type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
if DEBUG:
Log.note("{{field}} has {{num}} parts", field=column.es_index, num=cardinality)
with self.meta.columns.locker:
self.meta.columns.update({
"set": {
"count": count,
"cardinality": cardinality,
"multi": multi,
"last_updated": Date.now()
},
"clear": ["partitions"],
"where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
})
self.meta.columns.update({
"set": {
"count": count,
"cardinality": cardinality,
"multi": multi,
"last_updated": Date.now()
},
"clear": ["partitions"],
"where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
})
return
elif len(column.nested_path) != 1:
query.aggs["_"] = {
@ -360,7 +373,7 @@ class FromESMetadata(Schema):
else:
query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}
result = self.default_es.post("/" + es_index + "/_search", data=query)
result = self.es_cluster.post("/" + es_index + "/_search", data=query)
aggs = result.aggregations._
if aggs._nested:
@ -368,19 +381,16 @@ class FromESMetadata(Schema):
else:
parts = jx.sort(aggs.buckets.key)
if DEBUG:
Log.note("{{field}} has {{parts}}", field=column.names["."], parts=parts)
with self.meta.columns.locker:
self.meta.columns.update({
"set": {
"count": count,
"cardinality": cardinality,
"multi": multi,
"partitions": parts,
"last_updated": Date.now()
},
"where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
})
self.meta.columns.update({
"set": {
"count": count,
"cardinality": cardinality,
"multi": multi,
"partitions": parts,
"last_updated": Date.now()
},
"where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
})
except Exception as e:
# CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
# from tests.test_jx import TEST_TABLE
@ -389,11 +399,10 @@ class FromESMetadata(Schema):
is_test_table = any(column.es_index.startswith(t) for t in [TEST_TABLE_PREFIX, TEST_TABLE])
if is_missing_index and is_test_table:
# WE EXPECT TEST TABLES TO DISAPPEAR
with self.meta.columns.locker:
self.meta.columns.update({
"clear": ".",
"where": {"eq": {"es_index": column.es_index}}
})
self.meta.columns.update({
"clear": ".",
"where": {"eq": {"es_index": column.es_index}}
})
self.index_does_not_exist.add(column.es_index)
else:
self.meta.columns.update({
@ -415,42 +424,42 @@ class FromESMetadata(Schema):
while not please_stop:
try:
if not self.todo:
with self.meta.columns.locker:
old_columns = [
c
for c in self.meta.columns
if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.type not in STRUCT
]
if old_columns:
if DEBUG:
Log.note(
"Old columns {{names|json}} last updated {{dates|json}}",
names=wrap(old_columns).es_column,
dates=[Date(t).format() for t in wrap(old_columns).last_updated]
)
self.todo.extend(old_columns)
# TEST CONSISTENCY
for c, d in product(list(self.todo.queue), list(self.todo.queue)):
if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
Log.error("")
else:
if DEBUG:
Log.note("no more metatdata to update")
old_columns = [
c
for c in self.meta.columns
if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.jx_type not in STRUCT
]
if old_columns:
if DEBUG:
Log.note(
"Old columns {{names|json}} last updated {{dates|json}}",
names=wrap(old_columns).es_column,
dates=[Date(t).format() for t in wrap(old_columns).last_updated]
)
self.todo.extend(old_columns)
# TEST CONSISTENCY
for c, d in product(list(self.todo.queue), list(self.todo.queue)):
if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
Log.error("")
else:
if DEBUG:
Log.note("no more metatdata to update")
column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
if DEBUG:
Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column)
if column:
if column.es_index in self.index_does_not_exist:
with self.meta.columns.locker:
self.meta.columns.update({
"clear": ".",
"where": {"eq": {"es_index": column.es_index}}
})
if column is THREAD_STOP:
continue
if column.type in STRUCT or column.es_column.endswith("." + EXISTS_TYPE):
with self.meta.columns.locker:
column.last_updated = Date.now()
if DEBUG:
Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column)
if column.es_index in self.index_does_not_exist:
self.meta.columns.update({
"clear": ".",
"where": {"eq": {"es_index": column.es_index}}
})
continue
if column.jx_type in STRUCT or column.es_column.endswith("." + EXISTS_TYPE):
column.last_updated = Date.now()
continue
elif column.last_updated >= Date.now()-TOO_OLD:
continue
@ -471,24 +480,159 @@ class FromESMetadata(Schema):
if c == THREAD_STOP:
break
if not c.last_updated or c.last_updated >= Date.now()-TOO_OLD:
if c.last_updated >= Date.now()-TOO_OLD:
continue
with self.meta.columns.locker:
self.meta.columns.update({
"set": {
"last_updated": Date.now()
},
"clear":[
"count",
"cardinality",
"multi",
"partitions",
],
"where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
})
self.meta.columns.update({
"set": {
"last_updated": Date.now()
},
"clear":[
"count",
"cardinality",
"multi",
"partitions",
],
"where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
})
if DEBUG:
Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
Log.note("Did not get {{col.es_index}}.{{col.es_column}} info", col=c)
def get_table(self, alias_name):
with self.meta.tables.locker:
return wrap([t for t in self.meta.tables.data if t.name == alias_name])
def get_snowflake(self, fact_table_name):
return Snowflake(fact_table_name, self)
def get_schema(self, name):
if name == "meta.columns":
return self.meta.columns.schema
query_path = split_field(name)
return self.get_snowflake(query_path[0]).get_schema(join_field(query_path[1:]))
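For orientation, here is a minimal sketch of how get_schema resolves a dotted name into a fact table plus a query path; the helper functions only approximate mo_dots.split_field/join_field, and the table name is made up:
# Sketch only: the name resolution used by get_schema()
def split_field(field):
    return [] if field in (None, "", ".") else field.split(".")

def join_field(path):
    return ".".join(path) or "."

name = "unittest.run.files"              # hypothetical nested table name
query_path = split_field(name)           # ['unittest', 'run', 'files']
fact_table = query_path[0]               # 'unittest' -> selects the Snowflake
inner = join_field(query_path[1:])       # 'run.files' -> Schema within that Snowflake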
class Snowflake(object):
"""
REPRESENT ONE ALIAS, AND ITS NESTED ARRAYS
"""
def __init__(self, alias, namespace):
self.alias = alias
self.namespace = namespace
def get_schema(self, query_path):
return Schema(query_path, self)
@property
def query_paths(self):
"""
RETURN A LIST OF ALL NESTED COLUMNS
"""
return self.namespace.alias_to_query_paths[self.alias]
@property
def columns(self):
"""
RETURN ALL COLUMNS FROM ORIGIN OF FACT TABLE
"""
return self.namespace.get_columns(literal_field(self.alias))
class Schema(jx_base.Schema):
"""
REPRESENT JUST ONE TABLE IN A SNOWFLAKE
"""
def __init__(self, query_path, snowflake):
if not isinstance(snowflake.query_paths[0], list):
Log.error("Snowflake query paths should be a list of string tuples (well, technically, a list of lists of strings)")
self.query_path = [
p
for p in snowflake.query_paths
if untype_path(p[0]) == query_path
][0]
self.snowflake = snowflake
def leaves(self, column_name):
"""
:param column_name:
:return: ALL COLUMNS THAT START WITH column_name, NOT INCLUDING DEEPER NESTED COLUMNS
"""
column_name = unnest_path(column_name)
columns = self.columns
deep_path = self.query_path[0]
for path in self.query_path:
output = [
c
for c in columns
if (
(c.names['.'] != "_id" or column_name == "_id") and
c.jx_type not in OBJECTS and
startswith_field(unnest_path(c.names[path]), column_name)
)
]
if output:
return output
return []
def values(self, column_name):
"""
RETURN ALL COLUMNS THAT column_name REFERS TO
"""
column_name = unnest_path(column_name)
columns = self.columns
deep_path = self.query_path[0]
for path in self.query_path:
output = [
c
for c in columns
if (
c.jx_type not in STRUCT and
untype_path(c.names[path]) == column_name
)
]
if output:
return output
return output
def __getitem__(self, column_name):
return self.values(column_name)
@property
def name(self):
return concat_field(self.snowflake.alias, self.query_path[0])
@property
def columns(self):
return self.snowflake.namespace.get_columns(literal_field(self.snowflake.alias))
def map_to_es(self):
"""
RETURN A MAP FROM THE NAMESPACE TO THE es_column NAME
"""
output = {}
for path in self.query_path:
set_default(
output,
{
k: c.es_column
for c in self.snowflake.columns
if c.jx_type not in STRUCT
for rel_name in [c.names[path]]
for k in [rel_name, untype_path(rel_name), unnest_path(rel_name)]
}
)
return output
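As a rough illustration (the column names and typed suffixes below are assumptions, not taken from this commit), map_to_es produces a plain dict from every spelling of a column name to the es_column that backs it:
# Hypothetical result of map_to_es() for two leaf columns
mapping = {
    "run.suite": "run.suite.~s~",        # assumed typed string column
    "build.date": "build.date.~n~",      # assumed typed numeric column
}
es_column = mapping["run.suite"]         # -> "run.suite.~s~"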
class Table(jx_base.Table):
def __init__(self, full_name, container):
jx_base.Table.__init__(self, full_name)
self.container=container
self.schema = container.namespace.get_schema(full_name)
def _counting_query(c):
@ -502,7 +646,7 @@ def _counting_query(c):
"aggs": {
"_nested": {"cardinality": {
"field": c.es_column,
"precision_threshold": 10 if c.type in elasticsearch.ES_NUMERIC_TYPES else 100
"precision_threshold": 10 if c.es_type in elasticsearch.ES_NUMERIC_TYPES else 100
}}
}
}
@ -512,59 +656,6 @@ def _counting_query(c):
}}
def metadata_columns():
return wrap(
[
Column(
names={".":c},
es_index="meta.columns",
es_column=c,
type="string",
nested_path=ROOT_PATH
)
for c in [
"type",
"nested_path",
"es_column",
"es_index"
]
] + [
Column(
es_index="meta.columns",
names={".":c},
es_column=c,
type="object",
nested_path=ROOT_PATH
)
for c in [
"names",
"domain",
"partitions"
]
] + [
Column(
names={".": c},
es_index="meta.columns",
es_column=c,
type="long",
nested_path=ROOT_PATH
)
for c in [
"count",
"cardinality"
]
] + [
Column(
names={".": "last_updated"},
es_index="meta.columns",
es_column="last_updated",
type="time",
nested_path=ROOT_PATH
)
]
)
def metadata_tables():
return wrap(
[
@ -572,7 +663,7 @@ def metadata_tables():
names={".": c},
es_index="meta.tables",
es_column=c,
type="string",
es_type="string",
nested_path=ROOT_PATH
)
for c in [
@ -582,29 +673,17 @@ def metadata_tables():
]
]+[
Column(
names={".": "timestamp"},
names={".": c},
es_index="meta.tables",
es_column="timestamp",
type="integer",
es_column=c,
es_type="integer",
nested_path=ROOT_PATH
)
for c in [
"timestamp"
]
]
)
def init_database(sql):
sql.execute("""
CREATE TABLE tables AS (
table_name VARCHAR(200),
alias CHAR
)
""")
OBJECTS = (jx_base.OBJECT, jx_base.EXISTS)

19
vendor/jx_python/__init__.py поставляемый
Просмотреть файл

@ -39,18 +39,18 @@ def _delayed_imports():
MySQL = None
try:
from jx_elasticsearch.meta import FromESMetadata
from jx_elasticsearch.meta import ElasticsearchMetadata
except Exception:
FromESMetadata = None
ElasticsearchSnowflake = None
set_default(container.type2container, {
"mysql": MySQL,
"memory": None,
"meta": FromESMetadata
"meta": ElasticsearchMetadata
})
def wrap_from(frum, schema=None):
def find_container(frum, schema=None):
"""
:param frum:
:param schema:
@ -66,7 +66,6 @@ def wrap_from(frum, schema=None):
Log.error("expecting jx_base.container.config.default.settings to contain default elasticsearch connection info")
type_ = None
index = frum
if frum.startswith("meta."):
if frum == "meta.columns":
return _meta.singlton.meta.columns.denormalized()
@ -74,13 +73,13 @@ def wrap_from(frum, schema=None):
return _meta.singlton.meta.tables
else:
Log.error("{{name}} not a recognized table", name=frum)
else:
type_ = container.config.default.type
index = split_field(frum)[0]
type_ = container.config.default.type
fact_table_name = split_field(frum)[0]
settings = set_default(
{
"index": index,
"index": fact_table_name,
"name": frum,
"exists": True,
},
@ -95,7 +94,7 @@ def wrap_from(frum, schema=None):
return container.type2container[frum.type](frum.settings)
elif isinstance(frum, Mapping) and (frum["from"] or isinstance(frum["from"], (list, set))):
from jx_base.query import QueryOp
return QueryOp.wrap(frum, schema=schema)
return QueryOp.wrap(frum, namespace=schema)
elif isinstance(frum, (list, set)):
return _ListContainer("test_list", frum)
else:

Просмотреть файл

@ -14,6 +14,15 @@ from __future__ import unicode_literals
import itertools
from collections import Mapping
from mo_math import UNION
import jx_base
from jx_base import Container
from jx_base.expressions import jx_expression, Expression, Variable, TRUE
from jx_python.expression_compiler import compile_expression
from jx_python.expressions import jx_expression_to_function
from jx_python.lists.aggs import is_aggs, list_aggs
from jx_python.meta import get_schema_from_list
from mo_collections import UniqueIndex
from mo_dots import Data, wrap, listwrap, unwraplist, unwrap, Null
from mo_future import sort_using_key
@ -21,21 +30,17 @@ from mo_logs import Log
from mo_threads import Lock
from pyLibrary import convert
from jx_base.expressions import jx_expression, Expression, TrueOp, Variable, TRUE
from jx_python.expressions import jx_expression_to_function
from jx_base.container import Container
from jx_python.expression_compiler import compile_expression
from jx_python.lists.aggs import is_aggs, list_aggs
from jx_python.meta import get_schema_from_list
_get = object.__getattribute__
class ListContainer(Container):
class ListContainer(Container, jx_base.Namespace, jx_base.Table):
"""
A CONTAINER WITH ONLY ONE TABLE
"""
def __init__(self, name, data, schema=None):
# TODO: STORE THIS LIKE A CUBE FOR FASTER ACCESS AND TRANSFORMATION
data = list(unwrap(data))
Container.__init__(self, data, schema)
Container.__init__(self)
if schema == None:
self._schema = get_schema_from_list(name, data)
else:
@ -52,6 +57,10 @@ class ListContainer(Container):
def schema(self):
return self._schema
@property
def namespace(self):
return self
def last(self):
"""
:return: Last element in the list, or Null
@ -91,7 +100,7 @@ class ListContainer(Container):
elif q.format == "table":
head = [c.names['.'] for c in output.schema.columns]
data = [
[r[h] for h in head]
[r if h == '.' else r[h] for h in head]
for r in output.data
]
return Data(header=head, data=data, meta={"format": "table"})
@ -170,6 +179,13 @@ class ListContainer(Container):
new_schema = None
if isinstance(select, list):
if all(
isinstance(s.value, Variable) and s.name == s.value.var
for s in select
):
names = set(s.value.var for s in select)
new_schema = Schema(".", [c for c in self.schema.columns if c.names['.'] in names])
push_and_pull = [(s.name, jx_expression_to_function(s.value)) for s in selects]
def selector(d):
output = Data()
@ -250,6 +266,23 @@ class ListContainer(Container):
def __len__(self):
return len(self.data)
# class Namespace(jx_base.Namespace):
def get_snowflake(self, name):
if self.name != name:
Log.error("This container only has table by name of {{name}}", name=name)
return self
def get_schema(self, name):
if self.name != name:
Log.error("This container only has table by name of {{name}}", name=name)
return self.schema
def get_table(self, name):
if self.name != name:
Log.error("This container only has table by name of {{name}}", name=name)
return self
def _exec(code):
try:
@ -261,6 +294,7 @@ def _exec(code):
from jx_base.schema import Schema
from jx_python import jx

56
vendor/jx_python/jx.py поставляемый
Просмотреть файл

@ -60,64 +60,64 @@ def get(expr):
return jx_expression_to_function(expr)
def run(query, frum=Null):
def run(query, container=Null):
"""
THIS FUNCTION IS SIMPLY SWITCHING BASED ON THE query["from"] CONTAINER,
BUT IT IS ALSO PROCESSING A list CONTAINER; SEPARATE TO A ListContainer
"""
if frum == None:
frum = wrap(query)['from']
query_op = QueryOp.wrap(query, table=frum, schema=frum.schema)
if container == None:
container = wrap(query)['from'].container
query_op = QueryOp.wrap(query, container=container, namespace=container.schema)
else:
query_op = QueryOp.wrap(query, frum, frum.schema)
query_op = QueryOp.wrap(query, container, container.namespace)
if frum == None:
if container == None:
from jx_python.containers.list_usingPythonList import DUAL
return DUAL.query(query_op)
elif isinstance(frum, Container):
return frum.query(query_op)
elif isinstance(frum, (list, set) + generator_types):
frum = wrap(list(frum))
elif isinstance(frum, Cube):
elif isinstance(container, Container):
return container.query(query_op)
elif isinstance(container, (list, set) + generator_types):
container = wrap(list(container))
elif isinstance(container, Cube):
if is_aggs(query_op):
return cube_aggs(frum, query_op)
elif isinstance(frum, QueryOp):
frum = run(frum)
return cube_aggs(container, query_op)
elif isinstance(container, QueryOp):
container = run(container)
else:
Log.error("Do not know how to handle {{type}}", type=frum.__class__.__name__)
Log.error("Do not know how to handle {{type}}", type=container.__class__.__name__)
if is_aggs(query_op):
frum = list_aggs(frum, query_op)
container = list_aggs(container, query_op)
else: # SETOP
if query_op.where is not TRUE:
frum = filter(frum, query_op.where)
container = filter(container, query_op.where)
if query_op.sort:
frum = sort(frum, query_op.sort, already_normalized=True)
container = sort(container, query_op.sort, already_normalized=True)
if query_op.select:
frum = select(frum, query_op.select)
container = select(container, query_op.select)
if query_op.window:
if isinstance(frum, Cube):
frum = list(frum.values())
if isinstance(container, Cube):
container = list(container.values())
for param in query_op.window:
window(frum, param)
window(container, param)
# AT THIS POINT frum IS IN LIST FORMAT, NOW PACKAGE RESULT
if query_op.format == "cube":
frum = convert.list2cube(frum)
container = convert.list2cube(container)
elif query_op.format == "table":
frum = convert.list2table(frum)
frum.meta.format = "table"
container = convert.list2table(container)
container.meta.format = "table"
else:
frum = wrap({
container = wrap({
"meta": {"format": "list"},
"data": frum
"data": container
})
return frum
return container
groupby = group_by.groupby

305
vendor/jx_python/meta.py поставляемый
Просмотреть файл

@ -11,19 +11,18 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from collections import Mapping
from datetime import date
from datetime import datetime
from jx_base import STRUCT, Column
from jx_base.container import Container
import jx_base
from jx_base import python_type_to_json_type
from jx_base import STRUCT, Column, Table
from jx_base.schema import Schema
from jx_python import jx
from mo_collections import UniqueIndex
from mo_dots import Data, concat_field, get_attr, listwrap, unwraplist, NullType, FlatList
from mo_dots import split_field, join_field, ROOT_PATH
from mo_dots import wrap
from mo_future import none_type
from mo_future import text_type, long, PY2
from mo_dots import Data, concat_field, get_attr, listwrap, unwraplist, NullType, FlatList, set_default, split_field, join_field, ROOT_PATH, wrap
from mo_future import none_type, text_type, long, PY2
from mo_json.typed_encoder import untype_path, unnest_path
from mo_logs import Log
from mo_threads import Lock
@ -32,48 +31,101 @@ from mo_times.dates import Date
singlton = None
class ColumnList(Container):
class ColumnList(Table):
"""
OPTIMIZED FOR THE PARTICULAR ACCESS PATTERNS USED
"""
def __init__(self):
Table.__init__(self, "meta.columns")
self.data = {} # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS)
self.locker = Lock()
self.count = 0
self.meta_schema = None
self._schema = None
self.extend(METADATA_COLUMNS)
def find(self, es_index, abs_column_name):
if "." in es_index and not es_index.startswith("meta."):
Log.error("unlikely index name")
if not abs_column_name:
return [c for cs in self.data.get(es_index, {}).values() for c in cs]
else:
return self.data.get(es_index, {}).get(abs_column_name, [])
with self.locker:
if es_index.startswith("meta."):
self._update_meta()
def insert(self, columns):
for column in columns:
self.add(column)
if not abs_column_name:
return [c for cs in self.data.get(es_index, {}).values() for c in cs]
else:
return self.data.get(es_index, {}).get(abs_column_name, [])
def extend(self, columns):
self.dirty = True
with self.locker:
for column in columns:
self._add(column)
def add(self, column):
self.dirty = True
with self.locker:
return self._add(column)
def _add(self, column):
columns_for_table = self.data.setdefault(column.es_index, {})
abs_cname = column.names["."]
_columns = columns_for_table.get(abs_cname)
if not _columns:
_columns = columns_for_table[abs_cname] = []
_columns.append(column)
self.count += 1
existing_columns = columns_for_table.setdefault(column.names["."], [])
for canonical in existing_columns:
if canonical is column:
return canonical
if canonical.es_type == column.es_type:
set_default(column.names, canonical.names)
for key in Column.__slots__:
canonical[key] = column[key]
return canonical
existing_columns.append(column)
return column
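The canonical-column merge in _add can be illustrated standalone; plain dicts stand in for Column objects here, so this is only a sketch of the rule, not the class API:
# Sketch of the canonical-column merge in _add()
columns_for_table = {}

def add(column):
    existing = columns_for_table.setdefault(column["name"], [])
    for canonical in existing:
        if canonical["es_type"] == column["es_type"]:
            canonical.update(column)     # same storage type: fold into the canonical record
            return canonical
    existing.append(column)              # new storage type: keep a separate entry
    return column

a = add({"name": "status", "es_type": "string", "count": 1})
b = add({"name": "status", "es_type": "string", "count": 5})
assert a is b and a["count"] == 5
c = add({"name": "status", "es_type": "long", "count": 2})
assert c is not a                        # a different es_type gets its own record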
def _update_meta(self):
if not self.dirty:
return
for mcl in self.data.get("meta.columns").values():
for mc in mcl:
count = 0
values = set()
objects = 0
multi = 1
for t, cs in self.data.items():
for c, css in cs.items():
for column in css:
value = column[mc.names["."]]
if value == None:
pass
else:
count += 1
if isinstance(value, list):
multi = max(multi, len(value))
try:
values |= set(value)
except Exception:
objects += len(value)
elif isinstance(value, Mapping):
objects += 1
else:
values.add(value)
mc.count = count
mc.cardinality = len(values) + objects
mc.partitions = jx.sort(values)
mc.multi = multi
mc.last_updated = Date.now()
self.dirty = False
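The statistics gathered in _update_meta reduce to a simple fold over the observed values; a standalone sketch:
# Sketch: count / cardinality / multi over one metadata property
values_seen = ["string", "string", ["long", "double"], None, "object"]
count = 0
distinct = set()
multi = 1
for v in values_seen:
    if v is None:
        continue
    count += 1
    if isinstance(v, list):
        multi = max(multi, len(v))       # widest multivalue seen so far
        distinct |= set(v)
    else:
        distinct.add(v)
assert count == 4 and multi == 2 and len(distinct) == 4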
def __iter__(self):
self._update_meta()
for t, cs in self.data.items():
for c, css in cs.items():
for column in css:
yield column
def __len__(self):
return self.count
return self.data['meta.columns']['es_index'].count
def update(self, command):
self.dirty = True
try:
command = wrap(command)
eq = command.where.eq
@ -81,62 +133,84 @@ class ColumnList(Container):
columns = self.find(eq.es_index, eq.name)
columns = [c for c in columns if all(get_attr(c, k) == v for k, v in eq.items())]
else:
columns = list(self)
columns = jx.filter(columns, command.where)
with self.locker:
columns = list(self)
columns = jx.filter(columns, command.where)
for col in list(columns):
for k in command["clear"]:
if k == ".":
columns.remove(col)
else:
col[k] = None
with self.locker:
for col in list(columns):
for k in command["clear"]:
if k == ".":
columns.remove(col)
else:
col[k] = None
for k, v in command.set.items():
col[k] = v
for k, v in command.set.items():
col[k] = v
except Exception as e:
Log.error("should not happen", cause=e)
def query(self, query):
query.frum = self.__iter__()
output = jx.run(query)
with self.locker:
self._update_meta()
query.frum = self.__iter__()
output = jx.run(query)
return output
def groupby(self, keys):
return jx.groupby(self.__iter__(), keys)
with self.locker:
self._update_meta()
return jx.groupby(self.__iter__(), keys)
@property
def schema(self):
return wrap({k: set(v) for k, v in self.data["meta.columns"].items()})
if not self._schema:
with self.locker:
self._update_meta()
self._schema = Schema(".", [c for cs in self.data["meta.columns"].values() for c in cs])
return self._schema
@property
def namespace(self):
return self
def denormalized(self):
"""
THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
THE DENORMALIZED PERSPECTIVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES
"""
output = [
{
"table": concat_field(c.es_index, untype_path(table)),
"name": untype_path(name),
"cardinality": c.cardinality,
"es_column": c.es_column,
"es_index": c.es_index,
"last_updated": c.last_updated,
"count": c.count,
"nested_path": [unnest_path(n) for n in c.nested_path],
"type": c.type
}
for tname, css in self.data.items()
for cname, cs in css.items()
for c in cs
if c.type not in STRUCT # and c.es_column != "_id"
for table, name in c.names.items()
]
if not self.meta_schema:
self.meta_schema = get_schema_from_list("meta\\.columns", output)
with self.locker:
self._update_meta()
output = [
{
"table": concat_field(c.es_index, untype_path(table)),
"name": untype_path(name),
"cardinality": c.cardinality,
"es_column": c.es_column,
"es_index": c.es_index,
"last_updated": c.last_updated,
"count": c.count,
"nested_path": [unnest_path(n) for n in c.nested_path],
"es_type": c.es_type,
"type": c.jx_type
}
for tname, css in self.data.items()
for cname, cs in css.items()
for c in cs
if c.jx_type not in STRUCT # and c.es_column != "_id"
for table, name in c.names.items()
]
from jx_python.containers.list_usingPythonList import ListContainer
return ListContainer("meta\\.columns", data=output, schema=self.meta_schema)
return ListContainer(
self.name,
data=output,
schema=jx_base.Schema(
"meta.columns",
SIMPLE_METADATA_COLUMNS
)
)
def get_schema_from_list(table_name, frum):
@ -169,11 +243,13 @@ def _get_schema_from_list(frum, table_name, parent, nested_path, columns):
names={table_name: full_name},
es_column=full_name,
es_index=".",
type="undefined",
jx_type=python_type_to_json_type[d.__class__],
es_type=row_type,
nested_path=nested_path
)
columns.add(column)
column.type = _merge_type[column.type][row_type]
column.es_type = _merge_type[column.es_type][row_type]
column.jx_type = _merge_type[column.jx_type][row_type]
else:
for name, value in d.items():
full_name = concat_field(parent, name)
@ -183,7 +259,7 @@ def _get_schema_from_list(frum, table_name, parent, nested_path, columns):
names={table_name: full_name},
es_column=full_name,
es_index=".",
type="undefined",
es_type="undefined",
nested_path=nested_path
)
columns.add(column)
@ -199,20 +275,87 @@ def _get_schema_from_list(frum, table_name, parent, nested_path, columns):
this_type = "nested"
else:
this_type = _type_to_name[value.__class__]
new_type = _merge_type[column.type][this_type]
if new_type == None:
Log.error("can not combine {{type1}} with {{type2}} for column {{column}}", type1=column.type, type2=this_type, column=full_name)
column.type = new_type
new_type = _merge_type[column.es_type][this_type]
column.es_type = new_type
if this_type == "object":
_get_schema_from_list([value], table_name, full_name, nested_path, columns)
elif this_type == "nested":
np = listwrap(nested_path)
newpath = unwraplist([join_field(split_field(np[0])+[name])]+np)
newpath = unwraplist([join_field(split_field(np[0]) + [name])] + np)
_get_schema_from_list(value, table_name, full_name, newpath, columns)
METADATA_COLUMNS = (
[
Column(
names={".": c},
es_index="meta.columns",
es_column=c,
es_type="string",
nested_path=ROOT_PATH
)
for c in ["es_type", "jx_type", "nested_path", "es_column", "es_index"]
] + [
Column(
es_index="meta.columns",
names={".": c},
es_column=c,
es_type="object",
nested_path=ROOT_PATH
)
for c in ["names", "partitions"]
] + [
Column(
names={".": c},
es_index="meta.columns",
es_column=c,
es_type="long",
nested_path=ROOT_PATH
)
for c in ["count", "cardinality", "multi"]
] + [
Column(
names={".": "last_updated"},
es_index="meta.columns",
es_column="last_updated",
es_type="time",
nested_path=ROOT_PATH
)
]
)
SIMPLE_METADATA_COLUMNS = (
[
Column(
names={".": c},
es_index="meta.columns",
es_column=c,
es_type="string",
nested_path=ROOT_PATH
)
for c in ["table", "name", "type", "nested_path"]
] + [
Column(
names={".": c},
es_index="meta.columns",
es_column=c,
es_type="long",
nested_path=ROOT_PATH
)
for c in ["count", "cardinality", "multi"]
] + [
Column(
names={".": "last_updated"},
es_index="meta.columns",
es_column="last_updated",
es_type="time",
nested_path=ROOT_PATH
)
]
)
_type_to_name = {
none_type: "undefined",
NullType: "undefined",
@ -242,6 +385,7 @@ _merge_type = {
"long": "long",
"float": "float",
"double": "double",
"number": "number",
"string": "string",
"object": "object",
"nested": "nested"
@ -253,6 +397,7 @@ _merge_type = {
"long": "long",
"float": "float",
"double": "double",
"number": "number",
"string": "string",
"object": None,
"nested": None
@ -264,6 +409,7 @@ _merge_type = {
"long": "long",
"float": "float",
"double": "double",
"number": "number",
"string": "string",
"object": None,
"nested": None
@ -275,6 +421,7 @@ _merge_type = {
"long": "long",
"float": "double",
"double": "double",
"number": "number",
"string": "string",
"object": None,
"nested": None
@ -286,6 +433,7 @@ _merge_type = {
"long": "double",
"float": "float",
"double": "double",
"number": "number",
"string": "string",
"object": None,
"nested": None
@ -297,6 +445,19 @@ _merge_type = {
"long": "double",
"float": "double",
"double": "double",
"number": "number",
"string": "string",
"object": None,
"nested": None
},
"number": {
"undefined": "number",
"boolean": "number",
"integer": "number",
"long": "number",
"float": "number",
"double": "number",
"number": "number",
"string": "string",
"object": None,
"nested": None
@ -308,6 +469,7 @@ _merge_type = {
"long": "string",
"float": "string",
"double": "string",
"number": "string",
"string": "string",
"object": None,
"nested": None
@ -319,6 +481,7 @@ _merge_type = {
"long": None,
"float": None,
"double": None,
"number": None,
"string": None,
"object": "object",
"nested": "nested"
@ -330,9 +493,9 @@ _merge_type = {
"long": None,
"float": None,
"double": None,
"number": None,
"string": None,
"object": "nested",
"nested": "nested"
}
}
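The lattice above is used by folding each observed type into a running merge, starting from "undefined" as _get_schema_from_list does. A small usage sketch, assuming the rows not visible in this hunk follow the same pattern:
# Sketch: folding observed types through the merge lattice
merged = "undefined"
for observed in ["integer", "long", "double"]:
    merged = _merge_type[merged][observed]
# "undefined"+"integer" -> "integer", then "long", then "double"
assert merged == "double"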

59
vendor/jx_python/namespace/__init__.py поставляемый
Просмотреть файл

@ -1,59 +0,0 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from collections import Mapping
from mo_dots import set_default, Data
from jx_base.query import QueryOp
class Namespace(object):
def convert(self, expr):
raise NotImplementedError()
def _convert_query(self, query):
output = QueryOp("from", None)
output.select = self._convert_clause(query.select)
output.where = self.convert(query.where)
output["from"] = self._convert_from(query["from"])
output.edges = self._convert_clause(query.edges)
output.having = convert_list(self._convert_having, query.having)
output.window = convert_list(self._convert_window, query.window)
output.sort = self._convert_clause(query.sort)
output.format = query.format
return output
def _convert_from(self, frum):
raise NotImplementedError()
def _convert_clause(self, clause):
raise NotImplementedError()
def _convert_having(self, clause):
raise NotImplementedError()
def _convert_window(self, clause):
raise NotImplementedError()
def convert_list(operator, operand):
if operand==None:
return None
elif isinstance(operand, Mapping):
return operator(operand)
else:
return map(operator, operand)

4
vendor/jx_python/table.py поставляемый
Просмотреть файл

@ -10,10 +10,12 @@
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
import jx_base
from mo_dots import Data
class Table(object):
class Table(jx_base.Table):
__slots__ = ['header', 'data', 'meta']

45
vendor/mo_collections/relation.py поставляемый
Просмотреть файл

@ -12,40 +12,61 @@ from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from mo_logs import Log
class Relation_usingList(object):
def __init__(self):
self.all=set()
self.all = set()
def len(self):
return len(self.all)
def add(self, key, value):
test = (key, value)
if test not in self.all:
self.all.add(test)
def testAndAdd(self, key, value):
"""
RETURN TRUE IF THIS RELATION IS NET-NEW
"""
test = (key, value)
if test not in self.all:
self.all.add(test)
return True
return False
output = test not in self.all
self.all.add(test)
return output
def extend(self, key, values):
for v in values:
self.add(key, v)
self[key] = v
def __getitem__(self, key):
"""
USE THIS IF YOU ARE CONFIDENT THIS IS A MANY-TO-ONE MAPPING
RETURN THE SINGLE CO-DOMAIN OBJECT THIS key MAPS TO
"""
output = [v for k, v in self.all if k == key]
if not output:
return None
elif len(output) == 1:
return output[0]
else:
Log.error("Not allowed")
def __setitem__(self, key, value):
self.all.add((key, value))
def get_domain(self, value):
"""
RETURN domain FOR GIVEN CODOMAIN
:param value:
:return:
"""
return [k for k, v in self.all if v == value]
def get_codomain(self, key):
"""
RETURN AN ARRAY OF OBJECTS THAT key MAPS TO
"""
return [v for k, v in self.all if k == key]
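A quick usage sketch of Relation_usingList as a many-to-many set of (key, value) pairs:
# Sketch: Relation_usingList usage
r = Relation_usingList()
r.add("a", 1)
r.extend("b", [1, 2])
assert r.testAndAdd("a", 2) is True      # net-new pair
assert r.testAndAdd("a", 2) is False     # already present
assert sorted(r.get_codomain("b")) == [1, 2]
assert sorted(r.get_domain(1)) == ["a", "b"]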
class Relation(object):
def __init__(self):
self.map = dict()
@ -96,5 +117,3 @@ class Relation(object):
def domain(self):
return self.map.keys()

8
vendor/mo_dots/__init__.py поставляемый
Просмотреть файл

@ -411,6 +411,12 @@ def lower_match(value, candidates):
def wrap(v):
"""
WRAP AS Data OBJECT FOR DATA PROCESSING: https://github.com/klahnakoski/mo-dots/tree/dev/docs
:param v: THE VALUE TO WRAP
:return: Data INSTANCE
"""
type_ = _get(v, "__class__")
if type_ is dict:
@ -422,7 +428,7 @@ def wrap(v):
elif type_ is list:
return FlatList(v)
elif type_ in generator_types:
return FlatList(list(v))
return FlatList(list(unwrap(vv) for vv in v))
else:
return v
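A small usage sketch of wrap(); the equality with None relies on mo_dots Null semantics:
# Sketch: wrap() turns plain structures into Data/FlatList for dot access
from mo_dots import wrap

d = wrap({"build": {"date": 1525800000}})
assert d.build.date == 1525800000
assert d.build.branch == None            # missing paths give Null, which compares equal to None

rows = wrap(r for r in [{"a": 1}, {"a": 2}])   # generators now become a FlatList of unwrapped items
assert [r.a for r in rows] == [1, 2]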

28
vendor/mo_dots/lists.py поставляемый
Просмотреть файл

@ -19,12 +19,20 @@ from mo_dots.nones import Null
_get = object.__getattribute__
_set = object.__setattr__
_emit_slice_warning = True
_datawrap = None
Log = None
def _late_import():
global _datawrap
global Log
from mo_dots.objects import datawrap as _datawrap
try:
from mo_logs import Log
except Exception:
from mo_dots.utils import PoorLogger as Log
_ = _datawrap
@ -33,6 +41,7 @@ class FlatList(list):
"""
ENCAPSULATES HANDLING OF Nulls BY wrapING ALL MEMBERS AS NEEDED
ENCAPSULATES FLAT SLICES ([::]) FOR USE IN WINDOW FUNCTIONS
https://github.com/klahnakoski/mo-dots/tree/dev/docs#flatlist-is-flat
"""
EMPTY = None
@ -50,7 +59,8 @@ class FlatList(list):
if isinstance(index, slice):
# IMPLEMENT FLAT SLICES (for i not in range(0, len(self)): assert self[i]==None)
if index.step is not None:
Log = _late_import()
if not Log:
_late_import()
Log.error("slice step must be None, do not know how to deal with values")
length = len(_get(self, "list"))
@ -78,7 +88,8 @@ class FlatList(list):
_list.append(None)
_list[i] = unwrap(y)
except Exception as e:
Log = _late_import()
if not Log:
_late_import()
Log.error("problem", cause=e)
def __getattribute__(self, key):
@ -95,20 +106,22 @@ class FlatList(list):
"""
simple `select`
"""
if not _datawrap:
if not Log:
_late_import()
return FlatList(vals=[unwrap(coalesce(_datawrap(v), Null)[key]) for v in _get(self, "list")])
def select(self, key):
Log = _late_import()
if not Log:
_late_import()
Log.error("Not supported. Use `get()`")
def filter(self, _filter):
return FlatList(vals=[unwrap(u) for u in (wrap(v) for v in _get(self, "list")) if _filter(u)])
def __delslice__(self, i, j):
Log = _late_import()
if not Log:
_late_import()
Log.error("Can not perform del on slice: modulo arithmetic was performed on the parameters. You can try using clear()")
def __clear__(self):
@ -134,8 +147,9 @@ class FlatList(list):
global _emit_slice_warning
if _emit_slice_warning:
_emit_slice_warning=False
Log = _late_import()
_emit_slice_warning = False
if not Log:
_late_import()
Log.warning("slicing is broken in Python 2.7: a[i:j] == a[i+len(a), j] sometimes. Use [start:stop:step] (see https://github.com/klahnakoski/pyLibrary/blob/master/pyLibrary/dot/README.md#the-slice-operator-in-python27-is-inconsistent)")
return self[i:j:]

12
vendor/mo_files/__init__.py поставляемый
Просмотреть файл

@ -408,6 +408,10 @@ class File(object):
class TempDirectory(File):
"""
A CONTEXT MANAGER FOR AN ALLOCATED, BUT UNOPENED TEMPORARY DIRECTORY
WILL BE DELETED WHEN EXITED
"""
def __new__(cls):
return File.__new__(cls, None)
@ -418,10 +422,14 @@ class TempDirectory(File):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
Thread.run("delete "+self.name, delete_daemon, file=self)
Thread.run("delete dir "+self.name, delete_daemon, file=self)
class TempFile(File):
"""
A CONTEXT MANAGER FOR AN ALLOCATED, BUT UNOPENED TEMPORARY FILE
WILL BE DELETED WHEN EXITED
"""
def __new__(cls, *args, **kwargs):
return object.__new__(cls)
@ -434,7 +442,7 @@ class TempFile(File):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
Thread.run("delete "+self.name, delete_daemon, file=self)
Thread.run("delete file "+self.name, delete_daemon, file=self)
def _copy(from_, to_):

11
vendor/mo_future/__init__.py поставляемый
Просмотреть файл

@ -37,8 +37,15 @@ if PY3:
unichr = chr
xrange = range
filter_type = type(filter(lambda x: True, []))
generator_types = (collections.Iterable, filter_type)
def _gen():
yield
generator_types = (
type(_gen()),
type(filter(lambda x: True, [])),
type({}.items()),
type({}.values())
)
unichr = chr
round = round
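Under the PY3 branch shown here, generator_types covers the common lazy sequences while excluding lists; a small check:
# Sketch: detecting lazy sequences uniformly
def is_lazy(value):
    return isinstance(value, generator_types)

assert is_lazy(x for x in [1, 2])
assert is_lazy(filter(None, [1, 2]))
assert is_lazy({}.values())
assert not is_lazy([1, 2])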

2
vendor/mo_json_config/__init__.py поставляемый
Просмотреть файл

@ -56,6 +56,8 @@ def expand(doc, doc_url="param://", params=None):
ASSUMING YOU ALREADY PULLED THE doc FROM doc_url, YOU CAN STILL USE THE
EXPANDING FEATURE
USE mo_json_config.expand({}) TO ASSUME CURRENT WORKING DIRECTORY
:param doc: THE DATA STRUCTURE FROM JSON SOURCE
:param doc_url: THE URL THIS doc CAME FROM (DEFAULT USES params AS A DOCUMENT SOURCE)
:param params: EXTRA PARAMETERS NOT FOUND IN THE doc_url PARAMETERS (WILL SUPERSEDE PARAMETERS FROM doc_url)
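A rough usage sketch; $ref is the reference key this library expands, and the URLs below are hypothetical:
# Sketch: expanding references in an already-loaded config
import mo_json_config

doc = {"logs": {"$ref": "file://logging.json"}}
config = mo_json_config.expand(doc, doc_url="file:///app/config.json")
# the relative $ref is resolved against doc_url and inlined into config.logs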

3
vendor/mo_kwargs/__init__.py поставляемый
Просмотреть файл

@ -97,9 +97,10 @@ def override(func):
if e.message.startswith(func_name) and "takes at least" in e:
missing = [p for p in params if str(p) not in packed]
get_logger().error(
"Problem calling {{func_name}}: Expecting parameter {{missing}}",
"Problem calling {{func_name}}: Expecting parameter {{missing}}, given {{given}}",
func_name=func_name,
missing=missing,
given=packed.keys(),
stack_depth=1
)
get_logger().error("Error dispatching call", e)

15
vendor/mo_logs/__init__.py поставляемый
Просмотреть файл

@ -103,7 +103,13 @@ class Log(object):
@classmethod
def stop(cls):
from mo_logs import profiles
"""
DECONSTRUCTS ANY LOGGING, AND RETURNS TO DIRECT-TO-stdout LOGGING
EXECUTING MULTIPLE TIMES IN A ROW IS SAFE, IT HAS NO NET EFFECT, IT STILL LOGS TO stdout
:return: NOTHING
"""
from mo_threads import profiles
if cls.cprofiler and hasattr(cls, "settings"):
if cls.cprofiler == None:
@ -429,7 +435,6 @@ class Log(object):
trace = exceptions.extract_stack(stack_depth + 1)
e = Except(exceptions.ERROR, template, params, cause, trace)
str_e = text_type(e)
error_mode = cls.error_mode
with suppress_exception:
@ -443,7 +448,7 @@ class Log(object):
)
cls.error_mode = error_mode
sys.stderr.write(str_e.encode('utf8'))
sys.stderr.write(str(e))
def write(self):
@ -475,6 +480,10 @@ def write_profile(profile_settings, stats):
stats_file.write(convert.list2tab(stats))
def _same_frame(frameA, frameB):
return (frameA.line, frameA.file) == (frameB.line, frameB.file)
# GET THE MACHINE METADATA
machine_metadata = wrap({
"pid": os.getpid(),

7
vendor/mo_logs/exceptions.py поставляемый
Просмотреть файл

@ -55,6 +55,13 @@ class Except(Exception):
@classmethod
def wrap(cls, e, stack_depth=0):
"""
ENSURE THE STACKTRACE AND CAUSAL CHAIN ARE CAPTURED, PLUS ADD FEATURES OF Except
:param e: AN EXCEPTION OF ANY TYPE
:param stack_depth: HOW MANY CALLS TO TAKE OFF THE TOP OF THE STACK TRACE
:return: AN Except OBJECT REPRESENTING THE SAME ERROR
"""
if e == None:
return Null
elif isinstance(e, (list, Except)):
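Typical use, as a sketch:
# Sketch: normalizing any exception into an Except
from mo_logs.exceptions import Except

try:
    1 / 0
except Exception as cause:
    e = Except.wrap(cause)               # captures the stack trace and causal chain
    # e can now be logged, or attached as the cause of a new error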

7
vendor/mo_logs/log_usingElasticSearch.py поставляемый
Просмотреть файл

@ -37,6 +37,10 @@ class StructuredLogger_usingElasticSearch(StructuredLogger):
"""
settings ARE FOR THE ELASTICSEARCH INDEX
"""
kwargs.timeout = Duration(coalesce(self.es.settings.timeout, "30second")).seconds
kwargs.retry.times = coalesce(self.es.settings.retry.times, 3)
kwargs.retry.sleep = Duration(coalesce(self.es.settings.retry.sleep, MINUTE)).seconds
self.es = Cluster(kwargs).get_or_create_index(
schema=mo_json.json2value(value2json(SCHEMA), leaves=True),
limit_replicas=True,
@ -46,8 +50,7 @@ class StructuredLogger_usingElasticSearch(StructuredLogger):
self.batch_size = batch_size
self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
self.queue = Queue("debug logs to es", max=max_size, silent=True)
self.es.settings.retry.times = coalesce(self.es.settings.retry.times, 3)
self.es.settings.retry.sleep = Duration(coalesce(self.es.settings.retry.sleep, MINUTE))
Thread.run("add debug logs to es", self._insert_loop)
def write(self, template, params):

13
vendor/mo_logs/log_usingStream.py поставляемый
Просмотреть файл

@ -22,8 +22,9 @@ from mo_logs.strings import expand_template
class StructuredLogger_usingStream(StructuredLogger):
def __init__(self, stream):
self.locker = allocate_lock()
try:
self.locker = allocate_lock()
self.flush = stream.flush
if stream in (sys.stdout, sys.stderr):
if PY3:
self.writer = stream.write
@ -33,8 +34,8 @@ class StructuredLogger_usingStream(StructuredLogger):
self.writer = _UTF8Encoder(stream).write
else:
self.writer = stream.write
except Exception as e:
sys.stderr("can not handle")
except Exception as _:
sys.stderr.write("can not handle")
def write(self, template, params):
value = expand_template(template, params)
@ -45,7 +46,7 @@ class StructuredLogger_usingStream(StructuredLogger):
self.locker.release()
def stop(self):
pass
self.flush()
class _UTF8Encoder(object):
@ -56,5 +57,5 @@ class _UTF8Encoder(object):
def write(self, v):
try:
self.stream.write(v.encode('utf8'))
except Exception as e:
sys.stderr("can not handle")
except Exception as _:
sys.stderr.write("can not handle")

4
vendor/mo_logs/log_usingThreadedStream.py поставляемый
Просмотреть файл

@ -93,7 +93,9 @@ def time_delta_pusher(please_stop, appender, queue, interval):
next_run = time() + interval
while not please_stop:
Thread.current().cprofiler.disable()
(Till(till=next_run) | please_stop).wait()
Thread.current().cprofiler.enable()
next_run = time() + interval
logs = queue.pop_all()
if not logs:
@ -116,7 +118,7 @@ def time_delta_pusher(please_stop, appender, queue, interval):
appender(u"\n".join(lines) + u"\n")
except Exception as e:
sys.stderr.write(b"Trouble with appender: " + str(e.__class__.__name__) + b"\n")
sys.stderr.write(str("Trouble with appender: ") + str(e.__class__.__name__) + str("\n"))
# SWALLOW ERROR, MUST KEEP RUNNING

51
vendor/mo_logs/startup.py поставляемый
Просмотреть файл

@ -20,7 +20,7 @@ import tempfile
import mo_json_config
from mo_files import File
from mo_logs import Log
from mo_dots import listwrap, wrap, unwrap
from mo_dots import listwrap, wrap, unwrap, coalesce
# PARAMETERS MATCH argparse.ArgumentParser.add_argument()
@ -58,41 +58,34 @@ def argparse(defs):
return wrap(output)
def read_settings(filename=None, defs=None, env_filename=None):
def read_settings(filename=None, defs=None):
"""
:param filename: Force load a file
:param defs: arguments you want to accept
:param env_filename: A config file from an environment variable (a fallback config file, if no other provided)
:param default_filename: A config file from an environment variable (a fallback config file, if no other provided)
:return:
"""
# READ SETTINGS
if filename:
settings_file = File(filename)
if not settings_file.exists:
Log.error("Can not file settings file {{filename}}", {
"filename": settings_file.abspath
})
settings = mo_json_config.get("file:///" + settings_file.abspath)
if defs:
settings.args = argparse(defs)
return settings
else:
defs = listwrap(defs)
defs.append({
"name": ["--config", "--settings", "--settings-file", "--settings_file"],
"help": "path to JSON file with settings",
"type": str,
"dest": "filename",
"default": "config.json",
"required": False
})
args = argparse(defs)
defs = listwrap(defs)
defs.append({
"name": ["--config", "--settings", "--settings-file", "--settings_file"],
"help": "path to JSON file with settings",
"type": str,
"dest": "filename",
"default": None,
"required": False
})
args = argparse(defs)
if env_filename:
args.filename = env_filename
settings = mo_json_config.get("file://" + args.filename.replace(os.sep, "/"))
settings.args = args
return settings
args.filename = coalesce(filename, args.filename, "./config.json")
settings_file = File(args.filename)
if not settings_file.exists:
Log.error("Can not read configuration file {{filename}}", {
"filename": settings_file.abspath
})
settings = mo_json_config.get("file:///" + settings_file.abspath)
settings.args = args
return settings
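A usage sketch; the extra definition is hypothetical, and without --config on the command line the configuration falls back to ./config.json:
# Sketch: reading configuration with one extra command-line definition
from mo_logs import startup

settings = startup.read_settings(defs=[{
    "name": ["--id"],
    "help": "run identifier",
    "type": str,
    "dest": "id",
    "required": False,
}])
# settings.args holds the parsed arguments; Log.error is raised if the config file is missing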
# snagged from https://github.com/pycontribs/tendo/blob/master/tendo/singleton.py (under licence PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2)

6
vendor/mo_math/__init__.py поставляемый
Просмотреть файл

@ -297,6 +297,12 @@ def MIN(values, *others):
def MAX(values, *others):
"""
DECISIVE MAX
:param values:
:param others:
:return:
"""
if others:
from mo_logs import Log

3
vendor/mo_testing/fuzzytestcase.py поставляемый
Просмотреть файл

@ -86,6 +86,8 @@ def assertAlmostEqual(test, expected, digits=None, places=None, msg=None, delta=
return
elif test is expected:
return
elif isinstance(expected, text_type):
assertAlmostEqualValue(test, expected, msg=msg, digits=digits, places=places, delta=delta)
elif isinstance(test, UniqueIndex):
if test ^ expected:
Log.error("Sets do not match")
@ -196,7 +198,6 @@ def assertAlmostEqualValue(test, expected, digits=None, places=None, msg=None, d
if diff < Math.ceiling(Math.log10(abs(test)))-places:
return
standardMsg = expand_template("{{test|json}} != {{expected|json}} within {{places}} places", locals())
raise AssertionError(coalesce(msg, "") + ": (" + standardMsg + ")")

21
vendor/mo_threads/__init__.py поставляемый
Просмотреть файл

@ -15,15 +15,21 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from mo_future import get_function_name
from mo_logs import Log
from mo_threads.lock import Lock
from mo_threads.signal import Signal
from mo_threads.till import Till
from mo_threads.threads import Thread, THREAD_STOP, THREAD_TIMEOUT
from mo_threads.queues import Queue
from mo_threads.queues import ThreadedQueue
from mo_threads.multiprocess import Process
from mo_threads.queues import Queue, ThreadedQueue
from mo_threads.signal import Signal
from mo_threads.threads import Thread, THREAD_STOP, THREAD_TIMEOUT, MainThread, stop_main_thread, MAIN_THREAD
from mo_threads.till import Till
Log.cprofiler_stats = Queue("cprofiler stats") # ACCUMULATION OF STATS FROM ALL THREADS
MAIN_THREAD.timers = Thread.run("timers daemon", till.daemon)
MAIN_THREAD.children.remove(threads.MAIN_THREAD.timers)
# from threading import Thread as _threading_Thread
@ -78,3 +84,4 @@ from mo_threads.multiprocess import Process
# _threading_Thread.setDaemon = _setDaemon
#
#

4
vendor/mo_threads/lock.py поставляемый
Просмотреть файл

@ -49,7 +49,7 @@ def _late_import():
class Lock(object):
"""
A NON-RE-ENTRANT LOCK WITH wait() AND
A NON-RE-ENTRANT LOCK WITH wait()
"""
__slots__ = ["name", "lock", "waiting"]
@ -77,7 +77,7 @@ class Lock(object):
"""
THE ASSUMPTION IS wait() WILL ALWAYS RETURN WITH THE LOCK ACQUIRED
:param till: WHEN TO GIVE UP WAITING FOR ANOTHER THREAD TO SIGNAL
:return: True IF SIGNALED TO GO, False IF TIMEOUT HAPPENED
:return: True IF SIGNALED TO GO, False IF till WAS SIGNALED
"""
waiter = Signal()
if self.waiting:
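The intended pattern, sketched: wait() is called while holding the lock, and the return value distinguishes a real signal from a timeout:
# Sketch: Lock.wait() inside a with block
from mo_threads import Lock, Till

locker = Lock("example lock")
todo = []

with locker:
    while not todo:
        if not locker.wait(till=Till(seconds=1)):
            break                        # till fired; the lock is still held here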

Просмотреть файл

@ -16,26 +16,12 @@ import pstats
from datetime import datetime
from time import clock
from mo_dots import Data
from mo_dots import wrap
from mo_dots import Data, wrap, Null
from mo_logs import Log
ON = False
profiles = {}
_Log = None
def _late_import():
global _Log
from mo_logs import Log as _Log
from mo_threads import Queue
if _Log.cprofiler_stats == None:
_Log.cprofiler_stats = Queue("cprofiler stats") # ACCUMULATION OF STATS FROM ALL THREADS
class Profiler(object):
"""
VERY SIMPLE PROFILER FOR USE IN with STATEMENTS
@ -48,13 +34,12 @@ class Profiler(object):
output = profiles.get(args[0])
if output:
return output
output = object.__new__(cls, *args)
output = object.__new__(cls)
return output
def __init__(self, description):
from jx_python.windows import Stats
if ON and not hasattr(self, "description"):
from jx_python.windows import Stats
self.description = description
self.samples = []
self.stats = Stats()()
@ -127,20 +112,25 @@ class CProfiler(object):
"""
def __init__(self):
if not _Log:
_late_import()
self.cprofiler = None
def __enter__(self):
if _Log.cprofiler:
_Log.note("starting cprofile")
if Log.cprofiler:
Log.note("starting cprofile")
self.cprofiler = cProfile.Profile()
self.cprofiler.enable()
def __exit__(self, exc_type, exc_val, exc_tb):
if self.cprofiler:
if self.cprofiler is not None:
self.cprofiler.disable()
_Log.cprofiler_stats.add(pstats.Stats(self.cprofiler))
Log.cprofiler_stats.add(pstats.Stats(self.cprofiler))
del self.cprofiler
_Log.note("done cprofile")
Log.note("done cprofile")
def enable(self):
if self.cprofiler is not None:
return self.cprofiler.enable()
def disable(self):
if self.cprofiler is not None:
return self.cprofiler.disable()
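A minimal sketch of CProfiler as a context manager; it only records when Log.cprofiler is switched on:
# Sketch: per-block profiling
from mo_threads.profiles import CProfiler

def work():
    return sum(i * i for i in range(1000))

profiler = CProfiler()
with profiler:                           # no-op unless Log.cprofiler is enabled
    work()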

78
vendor/mo_threads/queues.py поставляемый
Просмотреть файл

@ -22,14 +22,12 @@ from datetime import datetime
from time import time
from mo_dots import coalesce, Null
from mo_threads import Lock, Signal, Thread, THREAD_STOP, THREAD_TIMEOUT, Till
from mo_logs import Log
from mo_threads.lock import Lock
from mo_threads.signal import Signal
from mo_threads.threads import THREAD_STOP, THREAD_TIMEOUT, Thread
from mo_threads.till import Till
_convert = None
_Except = None
_CProfiler = None
_Log = None
DEBUG = False
# MAX_DATETIME = datetime(2286, 11, 20, 17, 46, 39)
@ -37,23 +35,6 @@ DEFAULT_WAIT_TIME = 10 * 60 # SECONDS
datetime.strptime('2012-01-01', '%Y-%m-%d') # http://bugs.python.org/issue7980
def _late_import():
global _convert
global _Except
global _CProfiler
global _Log
from mo_logs.exceptions import Except as _Except
from mo_logs.profiles import CProfiler as _CProfiler
from mo_logs import Log as _Log
_ = _convert
_ = _Except
_ = _CProfiler
_ = _Log
class Queue(object):
"""
SIMPLE MESSAGE QUEUE, multiprocessing.Queue REQUIRES SERIALIZATION, WHICH
@ -66,9 +47,6 @@ class Queue(object):
silent - COMPLAIN IF THE READERS ARE TOO SLOW
unique - SET True IF YOU WANT ONLY ONE INSTANCE IN THE QUEUE AT A TIME
"""
if not _Log:
_late_import()
self.name = name
self.max = coalesce(max, 2 ** 10)
self.silent = silent
@ -88,10 +66,10 @@ class Queue(object):
if value is not None:
yield value
except Exception as e:
_Log.warning("Tell me about what happened here", e)
Log.warning("Tell me about what happened here", e)
if not self.silent:
_Log.note("queue iterator is done")
Log.note("queue iterator is done")
def add(self, value, timeout=None):
with self.lock:
@ -103,7 +81,7 @@ class Queue(object):
self._wait_for_queue_space(timeout=timeout)
if self.please_stop and not self.allow_add_after_close:
_Log.error("Do not add to closed queue")
Log.error("Do not add to closed queue")
else:
if self.unique:
if value not in self.queue:
@ -117,7 +95,7 @@ class Queue(object):
SNEAK value TO FRONT OF THE QUEUE
"""
if self.please_stop and not self.allow_add_after_close:
_Log.error("Do not push to closed queue")
Log.error("Do not push to closed queue")
with self.lock:
self._wait_for_queue_space()
@ -132,12 +110,12 @@ class Queue(object):
"""
if till is not None and not isinstance(till, Signal):
_Log.error("Expecting a signal")
Log.error("Expecting a signal")
return Null, self.pop(till=till)
def extend(self, values):
if self.please_stop and not self.allow_add_after_close:
_Log.error("Do not push to closed queue")
Log.error("Do not push to closed queue")
with self.lock:
# ONCE THE queue IS BELOW LIMIT, ALLOW ADDING MORE
@ -171,16 +149,16 @@ class Queue(object):
if timeout != None:
time_to_stop_waiting = now + timeout
else:
time_to_stop_waiting = Null
time_to_stop_waiting = None
if self.next_warning < now:
self.next_warning = now + wait_time
while not self.please_stop and len(self.queue) >= self.max:
if now > time_to_stop_waiting:
if not _Log:
if not Log:
_late_import()
_Log.error(THREAD_TIMEOUT)
Log.error(THREAD_TIMEOUT)
if self.silent:
self.lock.wait(Till(till=time_to_stop_waiting))
@ -190,7 +168,7 @@ class Queue(object):
now = time()
if self.next_warning < now:
self.next_warning = now + wait_time
_Log.alert(
Log.alert(
"Queue by name of {{name|quote}} is full with ({{num}} items), thread(s) have been waiting {{wait_time}} sec",
name=self.name,
num=len(self.queue),
@ -215,7 +193,7 @@ class Queue(object):
:return: A value, or a THREAD_STOP or None
"""
if till is not None and not isinstance(till, Signal):
_Log.error("expecting a signal")
Log.error("expecting a signal")
with self.lock:
while True:
@ -229,7 +207,7 @@ class Queue(object):
break
return None
if DEBUG or not self.silent:
_Log.note(self.name + " queue stopped")
Log.note(self.name + " queue stopped")
return THREAD_STOP
def pop_all(self):
@ -289,13 +267,13 @@ class ThreadedQueue(Queue):
# BE CAREFUL! THE THREAD MAKING THE CALL WILL NOT BE YOUR OWN!
# DEFAULT BEHAVIOUR: THIS WILL KEEP RETRYING WITH WARNINGS
):
if not _Log:
if not Log:
_late_import()
if period !=None and not isinstance(period, (int, float, long)):
if not _Log:
if not Log:
_late_import()
_Log.error("Expecting a float for the period")
Log.error("Expecting a float for the period")
batch_size = coalesce(batch_size, int(max_size / 2) if max_size else None, 900)
max_size = coalesce(max_size, batch_size * 2) # REASONABLE DEFAULT
@ -328,7 +306,7 @@ class ThreadedQueue(Queue):
item = self.pop()
now = time()
if now > last_push + period:
# _Log.note("delay next push")
# Log.note("delay next push")
next_push = Till(till=now + period)
else:
item = self.pop(till=next_push)
@ -349,13 +327,13 @@ class ThreadedQueue(Queue):
try:
error_target(e, _buffer)
except Exception as f:
_Log.warning(
Log.warning(
"`error_target` should not throw, just deal",
name=name,
cause=f
)
else:
_Log.warning(
Log.warning(
"Unexpected problem",
name=name,
cause=e
@ -374,13 +352,13 @@ class ThreadedQueue(Queue):
try:
error_target(e, _buffer)
except Exception as f:
_Log.warning(
Log.warning(
"`error_target` should not throw, just deal",
name=name,
cause=f
)
else:
_Log.warning(
Log.warning(
"Problem with {{name}} pushing {{num}} items to data sink",
name=name,
num=len(_buffer),
@ -405,8 +383,8 @@ class ThreadedQueue(Queue):
# from jx_python import jx
#
# biggest = jx.sort(sizes, "size").last().id
# _Log.note("Big record {{id}}", id=biggest)
# _Log.note("{{name}} has {{num}} items with json size of {{size|comma}}", name=self.name, num=len(self.queue), size=size)
# Log.note("Big record {{id}}", id=biggest)
# Log.note("{{name}} has {{num}} items with json size of {{size|comma}}", name=self.name, num=len(self.queue), size=size)
return self
def extend(self, values):
@ -415,7 +393,7 @@ class ThreadedQueue(Queue):
self._wait_for_queue_space()
if not self.please_stop:
self.queue.extend(values)
_Log.note("{{name}} has {{num}} items", name=self.name, num=len(self.queue))
Log.note("{{name}} has {{num}} items", name=self.name, num=len(self.queue))
return self
def __enter__(self):
@ -430,3 +408,5 @@ class ThreadedQueue(Queue):
def stop(self):
self.add(THREAD_STOP)
self.thread.join()

150
vendor/mo_threads/threads.py поставляемый
Просмотреть файл

@ -15,20 +15,19 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
import signal as _signal
import sys
from copy import copy
from datetime import datetime, timedelta
from time import sleep
from mo_future import get_ident, start_new_thread, interrupt_main
from mo_dots import Data, unwraplist, Null
from mo_future import get_ident, start_new_thread, interrupt_main, get_function_name, text_type
from mo_logs import Log, Except
from mo_logs.profiles import CProfiler
from mo_threads import Till, Lock, Signal, till
from mo_threads.signal import AndSignals
from mo_threads.lock import Lock
from mo_threads.profiles import CProfiler
from mo_threads.signal import AndSignals, Signal
from mo_threads.till import Till
DEBUG = False
@ -81,8 +80,11 @@ class MainThread(object):
def __init__(self):
self.name = "Main Thread"
self.id = get_ident()
self.please_stop = Signal()
self.children = []
self.stop_logging = Log.stop
self.timers = None
self.cprofiler = Null
def add_child(self, child):
self.children.append(child)
@ -96,9 +98,15 @@ class MainThread(object):
def stop(self):
"""
BLOCKS UNTIL ALL THREADS HAVE STOPPED
THEN RUNS sys.exit(0)
"""
join_errors = []
self.please_stop.go()
self_thread = Thread.current()
if self_thread != MAIN_THREAD or self_thread != self:
Log.error("Only the main thread can call stop() on main thread")
join_errors = []
children = copy(self.children)
for c in reversed(children):
if DEBUG and c.name:
@ -122,11 +130,57 @@ class MainThread(object):
if join_errors:
Log.error("Problem while stopping {{name|quote}}", name=self.name, cause=unwraplist(join_errors))
self.stop_logging()
self.timers.stop()
self.timers.join()
if DEBUG:
Log.note("Thread {{name|quote}} now stopped", name=self.name)
sys.exit(0)
def wait_for_shutdown_signal(
self,
please_stop=False, # ASSIGN SIGNAL TO STOP EARLY
allow_exit=False, # ALLOW "exit" COMMAND ON CONSOLE TO ALSO STOP THE APP
wait_forever=True # IGNORE CHILD THREADS, NEVER EXIT. False => IF NO CHILD THREADS LEFT, THEN EXIT
):
"""
FOR USE BY PROCESSES THAT NEVER DIE UNLESS EXTERNAL SHUTDOWN IS REQUESTED
CALLING THREAD WILL SLEEP UNTIL keyboard interrupt, OR please_stop, OR "exit"
:param please_stop:
:param allow_exit:
:param wait_forever: If True, ignore child threads and never exit on its own; if False, exit once all child threads are done
:return:
"""
self_thread = Thread.current()
if self_thread != MAIN_THREAD or self_thread != self:
Log.error("Only the main thread can sleep forever (waiting for KeyboardInterrupt)")
if isinstance(please_stop, Signal):
self.please_stop.on_go(please_stop.go)
else:
please_stop = self.please_stop
if not wait_forever:
# TRIGGER SIGNAL WHEN ALL CHILD THREADS ARE DONE
pending = copy(self_thread.children)
all = AndSignals(please_stop, len(pending))
for p in pending:
p.stopped.on_go(all.done)
try:
if allow_exit:
_wait_for_exit(please_stop)
else:
_wait_for_interrupt(please_stop)
except KeyboardInterrupt as _:
Log.alert("SIGINT Detected! Stopping...")
except SystemExit as _:
Log.alert("SIGTERM Detected! Stopping...")
finally:
self.stop()
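A sketch of the calling pattern for a long-running service:
# Sketch: main() of a service that waits for external shutdown
from mo_threads import Thread, Till, MAIN_THREAD

def monitor(please_stop):
    while not please_stop:
        (Till(seconds=1) | please_stop).wait()   # periodic work goes here

Thread.run("monitor daemon", monitor)
MAIN_THREAD.wait_for_shutdown_signal(allow_exit=True)   # blocks until SIGINT, SIGTERM, or "exit"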
class Thread(object):
@ -152,7 +206,7 @@ class Thread(object):
self.thread = None
self.stopped = Signal("stopped signal for " + self.name)
self.cprofiler = None
self.cprofiler = Null
self.children = []
if "parent_thread" in kwargs:
@ -162,7 +216,6 @@ class Thread(object):
self.parent = Thread.current()
self.parent.add_child(self)
def __enter__(self):
return self
@ -210,7 +263,8 @@ class Thread(object):
try:
if self.target is not None:
a, k, self.args, self.kwargs = self.args, self.kwargs, None, None
with CProfiler(): # PROFILE IN HERE SO THAT __exit__() IS RUN BEFORE THREAD MARKED AS stopped
self.cprofiler = CProfiler()
with self.cprofiler: # PROFILE IN HERE SO THAT __exit__() IS RUN BEFORE THREAD MARKED AS stopped
response = self.target(*a, **k)
with self.synch_lock:
self.end_of_thread = Data(response=response)
@ -226,7 +280,7 @@ class Thread(object):
try:
Log.fatal("Problem in thread {{name|quote}}", name=self.name, cause=e)
except Exception:
sys.stderr.write(b"ERROR in thread: " + str(self.name) + b" " + str(e) + b"\n")
sys.stderr.write(str("ERROR in thread: " + self.name + " " + text_type(e) + "\n"))
finally:
try:
children = copy(self.children)
@ -260,9 +314,9 @@ class Thread(object):
if DEBUG:
Log.warning("problem with thread {{name|quote}}", cause=e, name=self.name)
finally:
self.stopped.go()
if DEBUG:
Log.note("thread {{name|quote}} is done", name=self.name)
self.stopped.go()
def is_alive(self):
return not self.stopped
@ -293,7 +347,9 @@ class Thread(object):
@staticmethod
def run(name, target, *args, **kwargs):
# ENSURE target HAS please_stop ARGUMENT
if "please_stop" not in target.__code__.co_varnames:
if get_function_name(target) == 'wrapper':
pass # GIVE THE override DECORATOR A PASS
elif "please_stop" not in target.__code__.co_varnames:
Log.error("function must have please_stop argument for signalling emergency shutdown")
Thread.num_threads += 1
@ -302,48 +358,6 @@ class Thread(object):
output.start()
return output
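Usage sketch: the target must accept please_stop (or be an @override wrapper), and shutdown is requested through the thread's own stop()/join():
# Sketch: a worker thread that honours please_stop
from mo_threads import Thread, Till

def worker(please_stop):
    while not please_stop:
        (Till(seconds=0.1) | please_stop).wait()

t = Thread.run("example worker", worker)
t.stop()                                 # signals please_stop for this thread
t.join()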
@staticmethod
def wait_for_shutdown_signal(
please_stop=False, # ASSIGN SIGNAL TO STOP EARLY
allow_exit=False, # ALLOW "exit" COMMAND ON CONSOLE TO ALSO STOP THE APP
wait_forever=True # IGNORE CHILD THREADS, NEVER EXIT. False -> IF NO CHILD THREADS LEFT, THEN EXIT
):
"""
FOR USE BY PROCESSES NOT EXPECTED TO EVER COMPLETE UNTIL EXTERNAL
SHUTDOWN IS REQUESTED
SLEEP UNTIL keyboard interrupt, OR please_stop, OR "exit"
:param please_stop:
:param allow_exit:
:param wait_forever:: Assume all needed threads have been launched. When done
:return:
"""
if not isinstance(please_stop, Signal):
please_stop = Signal()
please_stop.on_go(lambda: start_new_thread(_stop_main_thread, ()))
self_thread = Thread.current()
if self_thread != MAIN_THREAD:
Log.error("Only the main thread can sleep forever (waiting for KeyboardInterrupt)")
if not wait_forever:
# TRIGGER SIGNAL WHEN ALL EXITING THREADS ARE DONE
pending = copy(self_thread.children)
all = AndSignals(please_stop, len(pending))
for p in pending:
p.stopped.on_go(all.done)
try:
if allow_exit:
_wait_for_exit(please_stop)
else:
_wait_for_interrupt(please_stop)
except (KeyboardInterrupt, SystemExit) as _:
Log.alert("SIGINT Detected! Stopping...")
finally:
please_stop.go()
@staticmethod
def current():
@ -355,15 +369,26 @@ class Thread(object):
return MAIN_THREAD
def _stop_main_thread():
def stop_main_thread(*args):
global DEBUG
DEBUG = True
try:
if len(args):
Log.warning("exit with {{value}}", value=_describe_exit_codes.get(args[0], args[0]))
except Exception as _:
pass
finally:
MAIN_THREAD.stop()
except Exception as e:
e = Except.wrap(e)
Log.warning("Problem with threads", cause=e)
sys.exit(0)
_describe_exit_codes = {
_signal.SIGTERM: "SIGTERM",
_signal.SIGINT: "SIGINT"
}
_signal.signal(_signal.SIGTERM, stop_main_thread)
_signal.signal(_signal.SIGINT, stop_main_thread)
def _wait_for_exit(please_stop):
@ -416,11 +441,10 @@ def _interrupt_main_safely():
# WE COULD BE INTERRUPTING SELF
pass
MAIN_THREAD = MainThread()
ALL_LOCK = Lock("threads ALL_LOCK")
ALL = dict()
ALL[get_ident()] = MAIN_THREAD
MAIN_THREAD.timers = Thread.run("timers daemon", till.daemon)
MAIN_THREAD.children.remove(MAIN_THREAD.timers)

32
vendor/mo_threads/till.py vendored

@ -15,9 +15,10 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from mo_future import allocate_lock as _allocate_lock
from time import sleep, time
from weakref import ref
from mo_future import allocate_lock as _allocate_lock
from mo_future import text_type
from mo_threads.signal import Signal
@ -40,7 +41,7 @@ class Till(Signal):
if not Till.enabled:
return Till.done
elif till == None and timeout == None and seconds == None:
return Till.done
return None
else:
return object.__new__(cls)
@ -70,7 +71,7 @@ class Till(Signal):
with Till.locker:
if timeout != None:
Till.next_ping = min(Till.next_ping, timeout)
Till.new_timers.append((timeout, self))
Till.new_timers.append((timeout, ref(self)))
Till.done.go()
@ -108,13 +109,17 @@ def daemon(please_stop):
new_timers, Till.new_timers = Till.new_timers, []
if DEBUG and new_timers:
Log.note("new timers: {{timers}}", timers=[t for t, s in new_timers])
if len(new_timers) > 5:
Log.note("{{num}} new timers", num=len(new_timers))
else:
Log.note("new timers: {{timers}}", timers=[t for t, _ in new_timers])
sorted_timers.extend(new_timers)
if sorted_timers:
sorted_timers.sort(key=lambda r: r[0])
for i, (t, s) in enumerate(sorted_timers):
sorted_timers.sort(key=actual_time)
for i, rec in enumerate(sorted_timers):
t = actual_time(rec)
if now < t:
work, sorted_timers = sorted_timers[:i], sorted_timers[i:]
Till.next_ping = min(Till.next_ping, sorted_timers[0][0])
@ -126,15 +131,17 @@ def daemon(please_stop):
if DEBUG:
Log.note(
"done: {{timers}}. Remaining {{pending}}",
timers=[t for t, s in work],
pending=[t for t, s in sorted_timers]
timers=[t for t, s in work] if len(work) <= 5 else len(work),
pending=[t for t, s in sorted_timers] if len(sorted_timers) <= 5 else len(sorted_timers)
)
for t, s in work:
s.go()
for t, r in work:
s = r()
if s is not None:
s.go()
except Exception as e:
Log.warning("timer shutdown", cause=e)
Log.warning("unexpected timer shutdown", cause=e)
finally:
if DEBUG:
Log.alert("TIMER SHUTDOWN")
@ -145,4 +152,5 @@ def daemon(please_stop):
for t, s in new_work + sorted_timers:
s.go()
def actual_time(rec):
return 0 if rec[1]() is None else rec[0]
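The daemon above now holds weak references to pending Till signals: a timer nobody references any more is garbage collected and silently skipped, and actual_time sorts such dead entries to the front so they are flushed out of the pending list without firing. A standalone sketch of that weak-reference idea (plain Python, not the mo_threads code):

from time import time
from weakref import ref

class Alarm(object):
    def go(self):
        print("fired")

pending = []   # list of (deadline, weak reference to an Alarm)

def add_alarm(deadline, alarm):
    pending.append((deadline, ref(alarm)))   # weak: does not keep the alarm alive

def fire_due(now):
    for deadline, weak in list(pending):
        alarm = weak()
        if alarm is None:                    # abandoned: already collected, just drop it
            pending.remove((deadline, weak))
        elif deadline <= now:
            pending.remove((deadline, weak))
            alarm.go()

keep = Alarm()
add_alarm(time() - 1, keep)      # already due, and we hold a strong reference
add_alarm(time() - 1, Alarm())   # no strong reference kept, so it is typically never fired
fire_due(time())                 # prints "fired" for the alarm we kept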

12
vendor/mo_times/dates.py vendored

@ -71,7 +71,7 @@ class Date(object):
def format(self, format="%Y-%m-%d %H:%M:%S"):
try:
return unix2datetime(self.unix).strftime(format)
return text_type(unix2datetime(self.unix).strftime(format))
except Exception as e:
from mo_logs import Log
@ -160,11 +160,15 @@ class Date(object):
return self.add(-other)
def __lt__(self, other):
other = Date(other)
try:
other = Date(other)
except Exception:
return False
return self.unix < other.unix
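With this change an ordering comparison against something that cannot be parsed as a Date quietly reports False instead of raising. A small illustration (assumes mo_times exports Date, as used elsewhere in this commit):

from mo_times import Date

Date("2018-05-09") < Date("2018-05-10")   # True
Date("2018-05-09") < "not a date"         # False: the operand cannot be coerced to a Date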
def __eq__(self, other):
if other == None:
if other == None or other == '':
return Null
try:
@ -397,7 +401,7 @@ def unicode2Date(value, format=None):
else:
from mo_logs import Log
Log.error("Can not interpret {{value}} as a datetime", value= value)
Log.error("Can not interpret {{value}} as a datetime", value=value)
DATETIME_EPOCH = datetime(1970, 1, 1)

12
vendor/mo_times/timer.py vendored

@ -37,6 +37,7 @@ class Timer(object):
self.param = wrap(coalesce(param, {}))
self.debug = debug
self.silent = silent
self.agg = 0
self.start = 0
self.end = 0
self.interval = None
@ -51,6 +52,7 @@ class Timer(object):
def __exit__(self, type, value, traceback):
self.end = time()
self.interval = self.end - self.start
self.agg += self.interval
if self.debug:
param = wrap(self.param)
@ -60,7 +62,15 @@ class Timer(object):
@property
def duration(self):
end = time()
if not self.end:
return Duration(time() - self.start)
return Duration(end - self.start)
return Duration(self.interval)
@property
def total(self):
if not self.end:
Log.error("please ask for total time outside the context of measuring")
return Duration(self.agg)
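With the new agg counter, one Timer instance can be entered repeatedly and report its accumulated time through total once the last block has exited. A rough usage sketch (assumes mo_times.timer.Timer as constructed above; do_batch is a stand-in for real work):

from time import sleep
from mo_times.timer import Timer

def do_batch(n):
    sleep(0.01)          # stand-in for real work

work = Timer("processing batches")
for batch in range(3):
    with work:           # each with-block adds its interval to work.agg
        do_batch(batch)

print(work.total)        # accumulated Duration across all three batches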

7
vendor/pyLibrary/aws/s3.py vendored

@ -11,17 +11,16 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
import StringIO
import gzip
import zipfile
from tempfile import TemporaryFile
import boto
from BeautifulSoup import BeautifulSoup
from boto.s3.connection import Location
from mo_future import text_type
from bs4 import BeautifulSoup
from mo_dots import wrap, Null, coalesce, unwrap, Data
from mo_future import text_type, StringIO
from mo_kwargs import override
from mo_logs import Log, Except
from mo_logs.strings import utf82unicode, unicode2utf8
@ -472,7 +471,7 @@ def strip_extension(key):
def _unzip(compressed):
buff = StringIO.StringIO(compressed)
buff = StringIO(compressed)
archive = zipfile.ZipFile(buff, mode='r')
return archive.read(archive.namelist()[0])

9
vendor/pyLibrary/convert.py vendored

@ -46,6 +46,15 @@ def string2datetime(value, format=None):
return unix2datetime(Date(value, format).unix)
def string2boolean(value):
if value in ["true", "T"]:
return True
elif value in ["false", "F"]:
return False
else:
return None
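string2boolean recognizes only the four spellings above; anything else maps to None. For example (hedged; this assumes the function is called through pyLibrary.convert, where it is defined):

from pyLibrary import convert

convert.string2boolean("true")   # True
convert.string2boolean("F")      # False
convert.string2boolean("yes")    # None (unrecognized spelling)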
def str2datetime(value, format=None):
return unix2datetime(Date(value, format).unix)

347
vendor/pyLibrary/env/elasticsearch.py vendored

@ -29,7 +29,7 @@ from mo_logs.strings import utf82unicode, unicode2utf8
from mo_math import Math
from mo_math.randoms import Random
from mo_threads import Lock, ThreadedQueue, Till
from mo_times import Date, Timer
from mo_times import Date, Timer, MINUTE
from pyLibrary import convert
from pyLibrary.env import http
@ -38,6 +38,8 @@ ES_NUMERIC_TYPES = ["long", "integer", "double", "float"]
ES_PRIMITIVE_TYPES = ["string", "boolean", "integer", "date", "long", "double"]
INDEX_DATE_FORMAT = "%Y%m%d_%H%M%S"
STALE_METADATA = 10 * MINUTE
DATA_KEY = text_type("data")
@ -85,7 +87,7 @@ class Index(Features):
self.cluster = cluster or Cluster(kwargs)
try:
full_index = self.get_index(index)
full_index = self.cluster.get_canonical_index(index)
if full_index and alias==None:
kwargs.alias = kwargs.index
kwargs.index = full_index
@ -93,41 +95,40 @@ class Index(Features):
Log.error("not allowed")
if type == None:
# NO type PROVIDED, MAYBE THERE IS A SUITABLE DEFAULT?
index_ = self.cluster.get_metadata().indices[self.settings.index]
if not index_:
Log.error("can not find index {{index}}", index=self.settings.index)
about = self.cluster.get_metadata().indices[self.settings.index]
type = self.settings.type = _get_best_type_from_mapping(about.mappings)[0]
if type == "_default_":
Log.error("not allowed")
if not type:
Log.error("not allowed")
candidate_types = list(index_.mappings.keys())
if len(candidate_types) != 1:
Log.error("Expecting `type` parameter")
self.settings.type = type = candidate_types[0]
self.path = "/" + full_index + "/" + type
except Exception as e:
# EXPLORING (get_metadata()) IS NOT ALLOWED ON THE PUBLIC CLUSTER
Log.error("not expected", cause=e)
if not type:
Log.error("not allowed")
self.path = "/" + full_index + "/" + type
if self.debug:
Log.alert("elasticsearch debugging for {{url}} is on", url=self.url)
if tjson:
from pyLibrary.env.typed_inserter import TypedInserter
self.encode = TypedInserter(self, id_column).typed_encode
props = self.get_properties()
if not props:
tjson = coalesce(kwargs.tjson, True) # TYPED JSON IS DEFAULT
elif props[EXISTS_TYPE]:
if tjson is False:
Log.error("expecting tjson parameter to match properties of {{index}}", index=index)
elif tjson == None:
tjson = kwargs.tjson = True
else:
if tjson == None and not read_only:
props = self.get_properties()
if props[EXISTS_TYPE]:
kwargs.tjson=True
from pyLibrary.env.typed_inserter import TypedInserter
self.encode = TypedInserter(self, id_column).typed_encode
else:
kwargs.tjson = False
Log.warning("{{index}} is not typed tjson={{tjson}}", index=self.settings.index, tjson=self.settings.tjson)
self.encode = get_encoder(id_column)
if tjson is True:
Log.error("expecting tjson parameter to match properties of {{index}}", index=index)
elif tjson == None:
tjson = kwargs.tjson = False
if not read_only:
if tjson:
from pyLibrary.env.typed_inserter import TypedInserter
self.encode = TypedInserter(self, id_column).typed_encode
else:
self.encode = get_encoder(id_column)
@ -145,12 +146,12 @@ class Index(Features):
self.cluster.info = None
return self.get_properties(retry=False)
if not index.mappings[self.settings.type]:
if not index.mappings[self.settings.type] and (index.mappings.keys()-{"_default_"}):
Log.warning(
"ElasticSearch index {{index|quote}} does not have type {{type|quote}} in {{metadata|json}}",
index=self.settings.index,
type=self.settings.type,
metadata=jx.sort(metadata.indices.keys())
metadata=jx.sort(index.mappings.keys())
)
return Null
return index.mappings[self.settings.type].properties
@ -195,35 +196,12 @@ class Index(Features):
# WAIT FOR ALIAS TO APPEAR
while True:
response = self.cluster.get("/_cluster/state", retry={"times": 5}, timeout=3)
if alias in response.metadata.indices[self.settings.index].aliases:
metadata = self.cluster.get_metadata(force=True)
if alias in metadata.indices[self.settings.index].aliases:
return
Log.note("Waiting for alias {{alias}} to appear", alias=alias)
Till(seconds=1).wait()
def get_index(self, alias):
"""
RETURN THE INDEX USED BY THIS alias
"""
alias_list = self.cluster.get_aliases()
output = jx.sort(set([
a.index
for a in alias_list
if a.alias == alias or
a.index == alias or
(re.match(re.escape(alias) + "\\d{8}_\\d{6}", a.index) and a.index != alias)
]))
if len(output) > 1:
Log.error("only one index with given alias==\"{{alias}}\" expected", alias= alias)
if not output:
return Null
return output.last()
def is_proto(self, index):
"""
RETURN True IF THIS INDEX HAS NOT BEEN ASSIGNED ITS ALIAS
@ -306,8 +284,6 @@ class Index(Features):
else:
raise NotImplementedError
def extend(self, records):
"""
records - MUST HAVE FORM OF
@ -407,6 +383,22 @@ class Index(Features):
Log.error("add() has changed to only accept one record, no lists")
self.extend([record])
def add_property(self, name, details):
if self.debug:
Log.note("Adding property {{prop}} to {{index}}", prop=name, index=self.settings.index)
for n in jx.reverse(split_field(name)):
if n == NESTED_TYPE:
details = {"properties": {n: set_default(details, {"type": "nested", "dynamic": True})}}
elif n.startswith(TYPE_PREFIX):
details = {"properties": {n: details}}
else:
details = {"properties": {n: set_default(details, {"type": "object", "dynamic": True})}}
self.cluster.put(
"/" + self.settings.index + "/_mapping/" + self.settings.type,
data=details
)
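add_property walks the dotted name from leaf to root, wrapping each segment in a "properties" object (nested for the typed-nested segment, a plain object otherwise) before PUTting the result to the type mapping. Roughly, for a hypothetical single-segment call, and assuming set_default fills only keys that are missing:

# index.add_property("result", {"type": "keyword", "store": True})
# builds a body shaped like this and PUTs it to /<index>/_mapping/<type>:
body = {
    "properties": {
        "result": {"type": "keyword", "store": True, "dynamic": True}
    }
}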
def refresh(self):
self.cluster.post("/" + self.settings.index + "/_refresh")
@ -436,7 +428,7 @@ class Index(Features):
elif self.cluster.version.startswith(("1.4.", "1.5.", "1.6.", "1.7.", "5.", "6.")):
result = self.cluster.put(
"/" + self.settings.index + "/_settings",
data='{"index":{"refresh_interval":' + value2json(interval) + '}}',
data={"index": {"refresh_interval": interval}},
**kwargs
)
@ -532,7 +524,7 @@ class Cluster(object):
return cluster
@override
def __init__(self, host, port=9200, explore_metadata=True, kwargs=None):
def __init__(self, host, port=9200, explore_metadata=True, debug=False, kwargs=None):
"""
settings.explore_metadata == True - IF PROBING THE CLUSTER FOR METADATA IS ALLOWED
settings.timeout == NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
@ -542,12 +534,13 @@ class Cluster(object):
self.settings = kwargs
self.info = None
self._metadata = None
self._metadata = Null
self.index_new_since = {} # MAP FROM INDEX NAME TO TIME THE INDEX METADATA HAS CHANGED
self.metadata_locker = Lock()
self.debug = kwargs.debug
self.version = None
self.last_metadata = Date.now()
self.debug = debug
self._version = None
self.path = kwargs.host + ":" + text_type(kwargs.port)
self.get_metadata()
@override
def get_or_create_index(
@ -560,7 +553,7 @@ class Cluster(object):
tjson=None,
kwargs=None
):
best = self._get_best(kwargs)
best = self.get_best_matching_index(index, alias)
if not best:
output = self.create_index(kwargs=kwargs, schema=schema, limit_replicas=limit_replicas)
return output
@ -573,39 +566,29 @@ class Cluster(object):
index = kwargs.index
meta = self.get_metadata()
columns = parse_properties(index, ".", meta.indices[index].mappings.values()[0].properties)
type, about = _get_best_type_from_mapping(meta.indices[index].mappings)
tjson = kwargs.tjson
if len(columns) != 0:
kwargs.tjson = tjson or any(
c.names["."].startswith(TYPE_PREFIX) or
c.names["."].find("." + TYPE_PREFIX) != -1
for c in columns
)
if tjson is None and not kwargs.tjson:
Log.warning("Not typed index, columns are:\n{{columns|json}}", columns=columns)
if tjson == None:
tjson = True
columns = parse_properties(index, ".", about.properties)
if len(columns) > 0:
tjson = any(
c.names["."].startswith(TYPE_PREFIX) or
c.names["."].find("." + TYPE_PREFIX) != -1
for c in columns
)
kwargs.tjson = tjson
return Index(kwargs=kwargs, cluster=self)
def _get_best(self, settings):
aliases = self.get_aliases()
indexes = jx.sort([
a
for a in aliases
if (a.alias == settings.index and settings.alias == None) or
(re.match(re.escape(settings.index) + r'\d{8}_\d{6}', a.index) and settings.alias == None) or
(a.index == settings.index and (settings.alias == None or a.alias == None or a.alias == settings.alias))
], "index")
return indexes.last()
@override
def get_index(self, index, type=None, alias=None, tjson=None, read_only=True, kwargs=None):
def get_index(self, index, type, alias=None, tjson=None, read_only=True, kwargs=None):
"""
TESTS THAT THE INDEX EXISTS BEFORE RETURNING A HANDLE
"""
if read_only:
# GET EXACT MATCH, OR ALIAS
aliases = self.get_aliases()
aliases = wrap(self.get_aliases())
if index in aliases.index:
pass
elif index in aliases.alias:
@ -617,7 +600,7 @@ class Cluster(object):
return Index(kwargs=kwargs, cluster=self)
else:
# GET BEST MATCH, INCLUDING PROTOTYPE
best = self._get_best(kwargs)
best = self.get_best_matching_index(index, alias)
if not best:
Log.error("Can not find index {{index_name}}", index_name=kwargs.index)
@ -643,6 +626,42 @@ class Cluster(object):
return Index(read_only=True, kwargs=settings, cluster=self)
Log.error("Can not find any index with alias {{alias_name}}", alias_name= alias)
def get_canonical_index(self, alias):
"""
RETURN THE INDEX USED BY THIS alias
THIS IS ACCORDING TO THE STRICT LIFECYCLE RULES:
THERE IS ONLY ONE INDEX WITH AN ALIAS
"""
output = jx.sort(set(
i
for ai in self.get_aliases()
for a, i in [(ai.alias, ai.index)]
if a == alias or i == alias or (re.match(re.escape(alias) + "\\d{8}_\\d{6}", i) and i != alias)
))
if len(output) > 1:
Log.error("only one index with given alias==\"{{alias}}\" expected", alias=alias)
if not output:
return Null
return output.last()
def get_best_matching_index(self, index, alias=None):
indexes = jx.sort(
[
ai_pair
for pattern in [re.escape(index) + r'\d{8}_\d{6}']
for ai_pair in self.get_aliases()
for a, i in [(ai_pair.alias, ai_pair.index)]
if (a == index and alias == None) or
(re.match(pattern, i) and alias == None) or
(i == index and (alias == None or a == None or a == alias))
],
"index"
)
return indexes.last()
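Both helpers lean on the lifecycle naming convention <alias>YYYYMMDD_HHMMSS. A hedged illustration with made-up index names, where cluster is a Cluster instance:

# suppose cluster.get_aliases() yields these pairs:
#   {"index": "unittest20180509_120000", "alias": "unittest"}
#   {"index": "unittest20180510_083000", "alias": None}     # fresh prototype, not aliased yet
best = cluster.get_best_matching_index("unittest")
# -> the pair for "unittest20180510_083000", the newest index matching the pattern
cluster.get_canonical_index("unittest")
# -> raises: the strict lifecycle expects exactly one index per alias,
#    and both names above match the pattern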
def get_prototype(self, alias):
"""
RETURN ALL INDEXES THAT ARE INTENDED TO BE GIVEN alias, BUT HAVE NO
@ -698,11 +717,13 @@ class Cluster(object):
Log.error("Expecting a JSON schema")
for k, m in list(schema.mappings.items()):
m.date_detection = False # DISABLE DATE DETECTION
if tjson:
schema.mappings[k] = add_typed_annotations(m)
m = schema.mappings[k] = wrap(add_typed_annotations(m))
m = wrap(schema.mappings[k])
schema.mappings[k].date_detection = False # DISABLE DATE DETECTION
m.date_detection = False # DISABLE DATE DETECTION
m.dynamic_templates = (
DEFAULT_DYNAMIC_TEMPLATES +
m.dynamic_templates
@ -737,11 +758,10 @@ class Cluster(object):
)
# CONFIRM INDEX EXISTS
while True:
while not Till(seconds=30):
try:
state = self.get("/_cluster/state", retry={"times": 5}, timeout=3, stream=False)
if index in state.metadata.indices:
self._metadata = None
metadata = self.get_metadata(force=True)
if index in metadata.indices:
break
Log.note("Waiting for index {{index}} to appear", index=index)
except Exception as e:
@ -784,37 +804,50 @@ class Cluster(object):
RETURN LIST OF {"alias":a, "index":i} PAIRS
ALL INDEXES INCLUDED, EVEN IF NO ALIAS {"alias":Null}
"""
data = self.get("/_aliases", retry={"times": 5}, timeout=3, stream=False)
output = []
for index, desc in data.items():
for index, desc in self.get_metadata().indices.items():
if not desc["aliases"]:
output.append({"index": index, "alias": None})
yield wrap({"index": index})
elif desc['aliases'][0] == index:
Log.error("should not happen")
else:
for a in desc["aliases"]:
output.append({"index": index, "alias": a})
return wrap(output)
yield wrap({"index": index, "alias": a})
def get_metadata(self, force=False):
if not self.settings.explore_metadata:
Log.error("Metadata exploration has been disabled")
if not self._metadata or force:
response = self.get("/_cluster/state", retry={"times": 3}, timeout=30, stream=False)
with self.metadata_locker:
self._metadata = wrap(response.metadata)
# REPLICATE MAPPING OVER ALL ALIASES
indices = self._metadata.indices
for i, m in jx.sort(indices.items(), {"value": {"offset": 0}, "sort": -1}):
m.index = i
for a in m.aliases:
if not indices[a]:
indices[a] = m
self.info = wrap(self.get("/", stream=False))
self.version = self.info.version.number
if not force and self._metadata and Date.now() < self.last_metadata + STALE_METADATA:
return self._metadata
old_indices = self._metadata.indices
response = self.get("/_cluster/state", retry={"times": 3}, timeout=30, stream=False)
now = self.last_metadata = Date.now()
with self.metadata_locker:
self._metadata = wrap(response.metadata)
for new_index_name, new_meta in self._metadata.indices.items():
old_index = old_indices[new_index_name]
if not old_index:
self.index_new_since[new_index_name] = now
else:
for type_name, new_about in new_meta.mappings.items():
old_about = old_index.mappings[type_name]
diff = diff_schema(new_about.properties, old_about.properties)
if diff:
self.index_new_since[new_index_name] = now
for old_index_name, old_meta in old_indices.items():
new_index = self._metadata.indices[old_index_name]
if not new_index:
self.index_new_since[old_index_name] = now
self.info = wrap(self.get("/", stream=False))
self._version = self.info.version.number
return self._metadata
@property
def version(self):
if self._version is None:
self.get_metadata()
return self._version
def post(self, path, **kwargs):
url = self.settings.host + ":" + text_type(self.settings.port) + path
@ -841,7 +874,7 @@ class Cluster(object):
Log.note("POST {{url}}", url=url)
response = http.post(url, **kwargs)
if response.status_code not in [200, 201]:
Log.error(response.reason.decode("latin1") + ": " + strings.limit(response.content.decode("latin1"), 100 if self.debug else 10000))
Log.error(text_type(response.reason) + ": " + strings.limit(response.content.decode("latin1"), 100 if self.debug else 10000))
if self.debug:
Log.note("response: {{response}}", response=utf82unicode(response.content)[:130])
details = json2value(utf82unicode(response.content))
@ -1058,16 +1091,7 @@ class Alias(Features):
mappings = self.cluster.get("/"+self.settings.index+"/_mapping")[self.settings.index]
# FIND MAPPING WITH MOST PROPERTIES (AND ASSUME THAT IS THE CANONICAL TYPE)
max_prop = -1
for _type, mapping in mappings.mappings.items():
if _type == "_default_":
continue
num_prop = len(mapping.properties.keys())
if max_prop < num_prop:
max_prop = num_prop
self.settings.type = _type
type = _type
type, props = _get_best_type_from_mapping(mappings.mappings)
if type == None:
Log.error("Can not find schema type for index {{index}}", index=coalesce(self.settings.alias, self.settings.index))
@ -1077,7 +1101,7 @@ class Alias(Features):
def url(self):
return self.cluster.path.rstrip("/") + "/" + self.path.lstrip("/")
def get_schema(self, retry=True):
def get_snowflake(self, retry=True):
if self.settings.explore_metadata:
indices = self.cluster.get_metadata().indices
if not self.settings.alias or self.settings.alias==self.settings.index:
@ -1186,6 +1210,7 @@ class Alias(Features):
cause=e
)
def parse_properties(parent_index_name, parent_name, esProperties):
"""
RETURN THE COLUMN DEFINITIONS IN THE GIVEN esProperties OBJECT
@ -1195,8 +1220,6 @@ def parse_properties(parent_index_name, parent_name, esProperties):
index_name = parent_index_name
column_name = concat_field(parent_name, name)
jx_name = column_name
if split_field(column_name)[-1] == EXISTS_TYPE:
property.type = "exists"
if property.type == "nested" and property.properties:
# NESTED TYPE IS A NEW TYPE DEFINITION
@ -1209,7 +1232,7 @@ def parse_properties(parent_index_name, parent_name, esProperties):
es_index=index_name,
es_column=column_name,
names={".": jx_name},
type="nested",
es_type="nested",
nested_path=ROOT_PATH
))
@ -1223,7 +1246,7 @@ def parse_properties(parent_index_name, parent_name, esProperties):
es_index=index_name,
es_column=column_name,
nested_path=ROOT_PATH,
type="source" if property.enabled == False else "object"
es_type="source" if property.enabled == False else "object"
))
if property.dynamic:
@ -1240,7 +1263,7 @@ def parse_properties(parent_index_name, parent_name, esProperties):
es_column=column_name,
names={".": jx_name},
nested_path=ROOT_PATH,
type=property.type
es_type=property.type
))
if property.index_name and name != property.index_name:
columns.append(Column(
@ -1248,7 +1271,7 @@ def parse_properties(parent_index_name, parent_name, esProperties):
es_column=column_name,
names={".": jx_name},
nested_path=ROOT_PATH,
type=property.type
es_type=property.type
))
elif property.enabled == None or property.enabled == False:
columns.append(Column(
@ -1256,7 +1279,7 @@ def parse_properties(parent_index_name, parent_name, esProperties):
es_column=column_name,
names={".": jx_name},
nested_path=ROOT_PATH,
type="source" if property.enabled == False else "object"
es_type="source" if property.enabled == False else "object"
))
else:
Log.warning("unknown type {{type}} for property {{path}}", type=property.type, path=query_path)
@ -1264,6 +1287,25 @@ def parse_properties(parent_index_name, parent_name, esProperties):
return columns
def _get_best_type_from_mapping(mapping):
"""
THERE ARE MULTIPLE TYPES IN AN INDEX, PICK THE BEST
:param mapping: THE ES MAPPING DOCUMENT
:return: (type_name, mapping) PAIR (mapping.properties WILL HAVE PROPERTIES)
"""
best_type_name = None
best_mapping = None
for k, m in mapping.items():
if k == "_default_":
continue
if best_type_name is None or len(m.properties) > len(best_mapping.properties):
best_type_name = k
best_mapping = m
if best_type_name == None:
return "_default_", mapping["_default_"]
return best_type_name, best_mapping
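A small, self-contained illustration of the selection rule, using a toy mapping (wrap gives the dot access the function expects): the type with the most properties wins, and _default_ is ignored.

from mo_dots import wrap

mapping = wrap({
    "_default_": {"properties": {}},
    "test_result": {"properties": {"ok": {"type": "boolean"}, "duration": {"type": "double"}}},
    "talos": {"properties": {"value": {"type": "double"}}},
})

name, about = _get_best_type_from_mapping(mapping)
# name == "test_result"; about.properties holds its two columns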
def get_encoder(id_expression="_id"):
get_id = jx_expression_to_function(id_expression)
@ -1404,18 +1446,44 @@ def add_typed_annotations(meta):
else:
output = {}
for meta_name, meta_value in meta.items():
if meta_name=='properties':
output[meta_name]={
if meta_name == 'properties':
output[meta_name] = {
prop_name: add_typed_annotations(about) if prop_name not in [BOOLEAN_TYPE, NUMBER_TYPE, STRING_TYPE, BOOLEAN_TYPE] else about
for prop_name, about in meta_value.items()
}
output[meta_name][EXISTS_TYPE] = {"type": "long", "store": True}
else:
output[meta_name]=meta_value
output[meta_name] = meta_value
return output
def diff_schema(A, B):
"""
RETURN PROPERTIES IN A, BUT NOT IN B
:param A: elasticsearch properties
:param B: elasticsearch properties
:return: (name, properties) PAIRS WHERE name IS DOT-DELIMITED PATH
"""
output = []
def _diff_schema(path, A, B):
for k, av in A.items():
bv = B[k]
if bv == None:
output.append((concat_field(path, k), av))
elif av.type == bv.type:
pass # OK
elif (av.type == None and bv.type == 'object') or (av.type == 'object' and bv.type == None):
pass # OK
else:
Log.warning("inconsistent types: {{typeA}} vs {{typeB}}", typeA=av.type, typeB=bv.type)
_diff_schema(concat_field(path, k), av.properties, bv.properties)
# what to do with conflicts?
_diff_schema(".", A, B)
return output
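A hedged, self-contained example of what diff_schema reports, using wrap so missing keys read as Null the way the function expects:

from mo_dots import wrap

old_props = wrap({"build": {"type": "object", "properties": {
    "result": {"type": "keyword"},
}}})
new_props = wrap({"build": {"type": "object", "properties": {
    "result": {"type": "keyword"},
    "revision": {"type": "keyword"},
}}})

diff_schema(new_props, old_props)
# -> [("build.revision", {"type": "keyword"})]   the property present in new_props but not old_props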
DEFAULT_DYNAMIC_TEMPLATES = wrap([
{
"default_typed_boolean": {
@ -1446,6 +1514,12 @@ DEFAULT_DYNAMIC_TEMPLATES = wrap([
"mapping": {"type": "nested", "store": True},
"match": NESTED_TYPE
}
},
{
"default_string": {
"mapping": {"type": "keyword", "store": True},
"match_mapping_type": "string"
}
}
])
@ -1547,4 +1621,3 @@ _merge_type = {
"nested": "nested"
}
}

7
vendor/pyLibrary/env/flask_wrappers.py vendored

@ -11,6 +11,7 @@ from __future__ import division
from __future__ import unicode_literals
import flask
from flask import Response
from mo_dots import coalesce
from mo_future import binary_type
@ -28,10 +29,8 @@ def gzip_wrapper(func, compress_lower_limit=None):
if 'gzip' not in accept_encoding.lower():
return response
resp = response.data
if isinstance(resp, binary_type) and len(resp) > compress_lower_limit:
response.headers['Content-Encoding'] = 'gzip'
response.set_data(b''.join(ibytes2icompressed([resp])))
response.headers['Content-Encoding'] = 'gzip'
response.response = ibytes2icompressed(response.response)
return response

22
vendor/pyLibrary/env/http.py vendored

@ -31,7 +31,7 @@ from jx_python import jx
from mo_dots import Data, coalesce, wrap, set_default, unwrap, Null
from mo_future import text_type, PY2
from mo_json import value2json, json2value
from mo_logs import Log
from mo_logs import Log, strings
from mo_logs.strings import utf82unicode, unicode2utf8
from mo_logs.exceptions import Except
from mo_math import Math
@ -157,7 +157,7 @@ def request(method, url, zip=None, retry=None, **kwargs):
try:
if DEBUG:
Log.note(u"http {{method}} to {{url}}", method=method, url=url)
Log.note(u"http {{method|upper}} to {{url}}", method=method, url=text_type(url))
request_count += 1
del kwargs['retry']
@ -221,11 +221,6 @@ def post(url, **kwargs):
return HttpResponse(request('post', url, **kwargs))
def delete(url, **kwargs):
kwargs.setdefault('stream', False)
return HttpResponse(request('delete', url, **kwargs))
def post_json(url, **kwargs):
"""
ASSUME RESPONSE IN IN JSON
@ -238,16 +233,11 @@ def post_json(url, **kwargs):
Log.error(u"Expecting `json` parameter")
response = post(url, **kwargs)
c = response.content
try:
details = json2value(utf82unicode(c))
except Exception as e:
Log.error(u"Unexpected return value {{content}}", content=c, cause=e)
details = json2value(utf82unicode(response.content))
if response.status_code not in [200, 201]:
Log.error(u"Bad response", cause=Except.wrap(details))
return details
Log.error(u"Bad response code {{code}}", code=response.status_code, cause=Except.wrap(details))
else:
return details
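post_json now parses the response body unconditionally and only then checks the status code, so a non-2xx answer surfaces the server's JSON error as the cause. Typical use (hedged; the URL and request body here are made up):

from pyLibrary.env import http

# returns the parsed JSON body as Data, or raises with the server's error attached
result = http.post_json("http://localhost:9200/unittest/_search", json={"size": 1})
print(result.hits.total)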
def put(url, **kwargs):

13
vendor/pyLibrary/env/typed_inserter.py vendored

@ -21,7 +21,7 @@ from jx_base import python_type_to_json_type, INTEGER, NUMBER, EXISTS, NESTED, S
from jx_python.expressions import jx_expression_to_function
from jx_python.meta import Column
from mo_dots import Data, FlatList, NullType, unwrap
from mo_future import text_type, binary_type, utf8_json_encoder, long
from mo_future import text_type, binary_type, utf8_json_encoder, long, sort_using_key
from mo_json import ESCAPE_DCT, float2json, json2value
from mo_json.encoder import problem_serializing, UnicodeBuilder, COMMA, COLON
from mo_json.typed_encoder import encode_property, BOOLEAN_TYPE, NESTED_TYPE, EXISTS_TYPE, STRING_TYPE, NUMBER_TYPE
@ -60,7 +60,7 @@ class TypedInserter(object):
if es:
_schema = Data()
for c in parse_properties(es.settings.alias, ".", es.get_properties()):
if c.type not in (OBJECT, NESTED):
if c.es_type not in (OBJECT, NESTED):
_schema[c.names["."]] = c
self.schema = unwrap(_schema)
else:
@ -127,7 +127,7 @@ class TypedInserter(object):
try:
if isinstance(sub_schema, Column):
value_json_type = python_type_to_json_type[value.__class__]
column_json_type = es_type_to_json_type[sub_schema.type]
column_json_type = es_type_to_json_type[sub_schema.es_type]
if value_json_type == column_json_type:
pass # ok
@ -283,9 +283,6 @@ class TypedInserter(object):
append(_buffer, '}')
elif _type is NullType:
append(_buffer, 'null')
elif hasattr(value, '__json__'):
from mo_logs import Log
Log.error("do not know how to handle")
elif hasattr(value, '__data__'):
self._typed_encode(value.__data__(), sub_schema, path, net_new_properties, _buffer)
elif hasattr(value, '__iter__'):
@ -338,11 +335,11 @@ class TypedInserter(object):
sep = COMMA
self._typed_encode(v, sub_schema, path, net_new_properties, _buffer)
count += 1
append(_buffer, ']'+COMMA+QUOTED_EXISTS_TYPE+COLON+ + text_type(count))
append(_buffer, ']' + COMMA + QUOTED_EXISTS_TYPE + COLON + text_type(count))
def _dict2json(self, value, sub_schema, path, net_new_properties, _buffer):
prefix = '{'
for k, v in ((kk, value[kk]) for kk in sorted(value.keys())):
for k, v in sort_using_key(value.items(), lambda r: r[0]):
if v == None or v == '':
continue
append(_buffer, prefix)

0
vendor/pyLibrary/queries/__init__.py vendored

458
vendor/pyLibrary/queries/jx_usingMySQL.py vendored

@ -1,458 +0,0 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from collections import Mapping
import mo_json
from jx_base.expressions import jx_expression
from mo_collections.matrix import Matrix
from mo_dots import coalesce
from mo_dots import wrap, listwrap, unwrap
from mo_dots.lists import FlatList
from mo_future import text_type
from mo_kwargs import override
from mo_logs import Log
from mo_logs.exceptions import suppress_exception
from mo_logs.strings import indent, expand_template
from pyLibrary import convert
from pyLibrary.sql import SQL, SQL_IS_NULL, SQL_AND, SQL_IS_NOT_NULL, SQL_ORDERBY, SQL_LIMIT, sql_iso, sql_list, SQL_TRUE, sql_alias, SQL_OR, SQL_WHERE, SQL_NOT
from pyLibrary.sql.mysql import int_list_packer
class MySQL(object):
"""
jx to MySQL DATABASE QUERIES
"""
@override
def __init__(
self,
host,
port,
username,
password,
debug=False,
schema=None,
preamble=None,
readonly=False,
kwargs=None
):
from pyLibrary.sql.mysql import MySQL
self.settings = kwargs
self._db = MySQL(kwargs)
def __data__(self):
settings = self.settings.copy()
settings.settings = None
return unwrap(settings)
def query(self, query, stacked=False):
"""
TRANSLATE JSON QUERY EXPRESSION ON SINGLE TABLE TO SQL QUERY
"""
from jx_base.query import QueryOp
query = QueryOp.wrap(query)
sql, post = self._subquery(query, isolate=False, stacked=stacked)
query.data = post(sql)
return query.data
def update(self, query):
self.db.execute("""
UPDATE {{table_name}}
SET {{assignment}}
{{where}}
""", {
"table_name": query["from"],
"assignment": ",".join(self.db.quote_column(k) + "=" + self.db.quote_value(v) for k, v in query.set),
"where": self._where2sql(query.where)
})
def _subquery(self, query, isolate=True, stacked=False):
if isinstance(query, text_type):
return self.db.quote_column(query), None
if query.name: # IT WOULD BE SAFER TO WRAP TABLE REFERENCES IN A TYPED OBJECT (Cube, MAYBE?)
return self.db.quote_column(query.name), None
if query.edges:
# RETURN A CUBE
sql, post = self._grouped(query, stacked)
else:
select = listwrap(query.select)
if select[0].aggregate != "none":
sql, post = self._aggop(query)
else:
sql, post = self._setop(query)
if isolate:
return "(\n" + sql + "\n) a\n", post
else:
return sql, post
def _grouped(self, query, stacked=False):
select = listwrap(query.select)
# RETURN SINGLE OBJECT WITH AGGREGATES
for s in select:
if s.aggregate not in aggregates:
Log.error("Expecting all columns to have an aggregate: {{select}}", select=s)
selects = FlatList()
groups = FlatList()
edges = query.edges
for e in edges:
if e.domain.type != "default":
Log.error("domain of type {{type}} not supported, yet", type=e.domain.type)
groups.append(e.value)
selects.append(sql_alias(e.value, self.db.quote_column(e.name)))
for s in select:
selects.append(sql_alias(aggregates[s.aggregate].replace("{{code}}", s.value), self.db.quote_column(s.name)))
sql = expand_template("""
SELECT
{{selects}}
FROM
{{table}}
{{where}}
GROUP BY
{{groups}}
""", {
"selects": SQL(",\n".join(selects)),
"groups": SQL(",\n".join(groups)),
"table": self._subquery(query["from"])[0],
"where": self._where2sql(query.where)
})
def post_stacked(sql):
# RETURN IN THE USUAL DATABASE RESULT SET FORMAT
return self.db.query(sql)
def post(sql):
# FIND OUT THE default DOMAIN SIZES
result = self.db.column_query(sql)
num_edges = len(edges)
for e, edge in enumerate(edges):
domain = edge.domain
if domain.type == "default":
domain.type = "set"
parts = set(result[e])
domain.partitions = [{"index": i, "value": p} for i, p in enumerate(parts)]
domain.map = {p: i for i, p in enumerate(parts)}
else:
Log.error("Do not know what to do here, yet")
# FILL THE DATA CUBE
maps = [(unwrap(e.domain.map), result[i]) for i, e in enumerate(edges)]
cubes = FlatList()
for c, s in enumerate(select):
data = Matrix(*[len(e.domain.partitions) + (1 if e.allow_nulls else 0) for e in edges])
for rownum, value in enumerate(result[c + num_edges]):
coord = [m[r[rownum]] for m, r in maps]
data[coord] = value
cubes.append(data)
if isinstance(query.select, list):
return cubes
else:
return cubes[0]
return sql, post if not stacked else post_stacked
def _aggop(self, query):
"""
SINGLE ROW RETURNED WITH AGGREGATES
"""
if isinstance(query.select, list):
# RETURN SINGLE OBJECT WITH AGGREGATES
for s in query.select:
if s.aggregate not in aggregates:
Log.error("Expecting all columns to have an aggregate: {{select}}", select=s)
selects = FlatList()
for s in query.select:
selects.append(sql_alias(aggregates[s.aggregate].replace("{{code}}", s.value),self.db.quote_column(s.name)))
sql = expand_template("""
SELECT
{{selects}}
FROM
{{table}}
{{where}}
""", {
"selects": SQL(",\n".join(selects)),
"table": self._subquery(query["from"])[0],
"where": self._where2sql(query.filter)
})
return sql, lambda sql: self.db.column(sql)[0] # RETURNING SINGLE OBJECT WITH AGGREGATE VALUES
else:
# RETURN SINGLE VALUE
s0 = query.select
if s0.aggregate not in aggregates:
Log.error("Expecting all columns to have an aggregate: {{select}}", select=s0)
select = sql_alias(aggregates[s0.aggregate].replace("{{code}}", s0.value) , self.db.quote_column(s0.name))
sql = expand_template("""
SELECT
{{selects}}
FROM
{{table}}
{{where}}
""", {
"selects": SQL(select),
"table": self._subquery(query["from"])[0],
"where": self._where2sql(query.where)
})
def post(sql):
result = self.db.column_query(sql)
return result[0][0]
return sql, post # RETURN SINGLE VALUE
def _setop(self, query):
"""
NO AGGREGATION, SIMPLE LIST COMPREHENSION
"""
if isinstance(query.select, list):
# RETURN BORING RESULT SET
selects = FlatList()
for s in listwrap(query.select):
if isinstance(s.value, Mapping):
for k, v in s.value.items:
selects.append(sql_alias(v, self.db.quote_column(s.name + "." + k)))
if isinstance(s.value, list):
for i, ss in enumerate(s.value):
selects.append(sql_alias(s.value, self.db.quote_column(s.name + "," + str(i))))
else:
selects.append(sql_alias(s.value, self.db.quote_column(s.name)))
sql = expand_template("""
SELECT
{{selects}}
FROM
{{table}}
{{where}}
{{sort}}
{{limit}}
""", {
"selects": SQL(",\n".join(selects)),
"table": self._subquery(query["from"])[0],
"where": self._where2sql(query.where),
"limit": self._limit2sql(query.limit),
"sort": self._sort2sql(query.sort)
})
def post_process(sql):
result = self.db.query(sql)
for s in listwrap(query.select):
if isinstance(s.value, Mapping):
for r in result:
r[s.name] = {}
for k, v in s.value:
r[s.name][k] = r[s.name + "." + k]
r[s.name + "." + k] = None
if isinstance(s.value, list):
# REWRITE AS TUPLE
for r in result:
r[s.name] = tuple(r[s.name + "," + str(i)] for i, ss in enumerate(s.value))
for i, ss in enumerate(s.value):
r[s.name + "," + str(i)] = None
expand_json(result)
return result
return sql, post_process # RETURN BORING RESULT SET
else:
# RETURN LIST OF VALUES
if query.select.value == ".":
select = "*"
else:
name = query.select.name
select = sql_alias(query.select.value, self.db.quote_column(name))
sql = expand_template("""
SELECT
{{selects}}
FROM
{{table}}
{{where}}
{{sort}}
{{limit}}
""", {
"selects": SQL(select),
"table": self._subquery(query["from"])[0],
"where": self._where2sql(query.where),
"limit": self._limit2sql(query.limit),
"sort": self._sort2sql(query.sort)
})
if query.select.value == ".":
def post(sql):
result = self.db.query(sql)
expand_json(result)
return result
return sql, post
else:
return sql, lambda sql: [r[name] for r in self.db.query(sql)] # RETURNING LIST OF VALUES
def _sort2sql(self, sort):
"""
RETURN ORDER BY CLAUSE
"""
if not sort:
return ""
return SQL_ORDERBY + sql_list([self.db.quote_column(o.field) + (" DESC" if o.sort == -1 else "") for o in sort])
def _limit2sql(self, limit):
return SQL("" if not limit else SQL_LIMIT + str(limit))
def _where2sql(self, where):
if where == None:
return ""
return SQL_WHERE + _esfilter2sqlwhere(self.db, where)
def esfilter2sqlwhere(db, esfilter):
return _esfilter2sqlwhere(db, esfilter)
def _esfilter2sqlwhere(db, esfilter):
"""
CONVERT ElasticSearch FILTER TO SQL FILTER
db - REQUIRED TO PROPERLY QUOTE VALUES AND COLUMN NAMES
"""
esfilter = wrap(esfilter)
if esfilter is True:
return SQL_TRUE
elif esfilter["and"]:
return sql_iso(SQL_AND.join([esfilter2sqlwhere(db, a) for a in esfilter["and"]]))
elif esfilter["or"]:
return sql_iso(SQL_OR.join([esfilter2sqlwhere(db, a) for a in esfilter["or"]]))
elif esfilter["not"]:
return SQL_NOT + sql_iso(esfilter2sqlwhere(db, esfilter["not"]))
elif esfilter.term:
return sql_iso(SQL_AND.join([
db.quote_column(col) + SQL("=") + db.quote_value(val)
for col, val in esfilter.term.items()
]))
elif esfilter.terms:
for col, v in esfilter.terms.items():
if len(v) == 0:
return "FALSE"
with suppress_exception:
int_list = convert.value2intlist(v)
has_null = False
for vv in v:
if vv == None:
has_null = True
break
if int_list:
filter = int_list_packer(col, int_list)
if has_null:
return esfilter2sqlwhere(db, {"or": [{"missing": col}, filter]})
else:
return esfilter2sqlwhere(db, filter)
else:
if has_null:
return esfilter2sqlwhere(db, {"missing": col})
else:
return "false"
return db.quote_column(col) + " in " + sql_iso(sql_list([db.quote_value(val) for val in v]))
elif esfilter.script:
return sql_iso(esfilter.script)
elif esfilter.range:
name2sign = {
"gt": SQL(">"),
"gte": SQL(">="),
"lte": SQL("<="),
"lt": SQL("<")
}
def single(col, r):
min = coalesce(r["gte"], r[">="])
max = coalesce(r["lte"], r["<="])
if min != None and max != None:
# SPECIAL CASE (BETWEEN)
sql = db.quote_column(col) + SQL(" BETWEEN ") + db.quote_value(min) + SQL_AND + db.quote_value(max)
else:
sql = SQL_AND.join(
db.quote_column(col) + name2sign[sign] + db.quote_value(value)
for sign, value in r.items()
)
return sql
output = sql_iso(SQL_AND.join([single(col, ranges) for col, ranges in esfilter.range.items()]))
return output
elif esfilter.missing:
if isinstance(esfilter.missing, text_type):
return sql_iso(db.quote_column(esfilter.missing) + SQL_IS_NULL)
else:
return sql_iso(db.quote_column(esfilter.missing.field) + SQL_IS_NULL)
elif esfilter.exists:
if isinstance(esfilter.exists, text_type):
return sql_iso(db.quote_column(esfilter.exists) + SQL_IS_NOT_NULL)
else:
return sql_iso(db.quote_column(esfilter.exists.field) + SQL_IS_NOT_NULL)
elif esfilter.match_all:
return SQL_TRUE
elif esfilter.instr:
return sql_iso(SQL_AND.join(["instr" + sql_iso(db.quote_column(col) + ", " + db.quote_value(val)) + ">0" for col, val in esfilter.instr.items()]))
else:
Log.error("Can not convert esfilter to SQL: {{esfilter}}", esfilter=esfilter)
def expand_json(rows):
# CONVERT JSON TO VALUES
for r in rows:
for k, json in list(r.items()):
if isinstance(json, text_type) and json[0:1] in ("[", "{"):
with suppress_exception:
value = mo_json.json2value(json)
r[k] = value
# MAP NAME TO SQL FUNCTION
aggregates = {
"one": "COUNT({{code}})",
"sum": "SUM({{code}})",
"add": "SUM({{code}})",
"count": "COUNT({{code}})",
"maximum": "MAX({{code}})",
"minimum": "MIN({{code}})",
"max": "MAX({{code}})",
"min": "MIN({{code}})",
"mean": "AVG({{code}})",
"average": "AVG({{code}})",
"avg": "AVG({{code}})",
"N": "COUNT({{code}})",
"X0": "COUNT({{code}})",
"X1": "SUM({{code}})",
"X2": "SUM(POWER({{code}}, 2))",
"std": "STDDEV({{code}})",
"stddev": "STDDEV({{code}})",
"var": "POWER(STDDEV({{code}}), 2)",
"variance": "POWER(STDDEV({{code}}), 2)"
}
from jx_base.container import type2container
type2container["mysql"] = MySQL

8
vendor/pyLibrary/sql/mysql.py vendored

@ -16,13 +16,10 @@ import subprocess
from collections import Mapping
from datetime import datetime
from pymysql import connect, InterfaceError, cursors
import mo_json
from jx_python import jx
from mo_dots import coalesce, wrap, listwrap, unwrap
from mo_files import File
from mo_future import text_type, utf8_json_encoder, binary_type
from mo_kwargs import override
from mo_logs import Log
from mo_logs.exceptions import Except, suppress_exception
@ -31,7 +28,10 @@ from mo_logs.strings import indent
from mo_logs.strings import outdent
from mo_math import Math
from mo_times import Date
from pyLibrary.sql import SQL, SQL_NULL, SQL_SELECT, SQL_LIMIT, SQL_WHERE, SQL_LEFT_JOIN, SQL_COMMA, SQL_FROM, SQL_AND, sql_list, sql_iso, SQL_ASC, SQL_TRUE, SQL_ONE, SQL_DESC, SQL_IS_NULL, sql_alias
from pymysql import connect, InterfaceError, cursors
from mo_future import text_type, utf8_json_encoder
from pyLibrary.sql import SQL, SQL_NULL, SQL_SELECT, SQL_LIMIT, SQL_WHERE, SQL_LEFT_JOIN, SQL_FROM, SQL_AND, sql_list, sql_iso, SQL_ASC, SQL_TRUE, SQL_ONE, SQL_DESC, SQL_IS_NULL, sql_alias
from pyLibrary.sql.sqlite import join_column
DEBUG = False

67
vendor/pyLibrary/sql/sqlite.py vendored

@ -17,6 +17,8 @@ import re
import sys
from collections import Mapping
from mo_kwargs import override
from mo_future import allocate_lock as _allocate_lock, text_type, zip_longest
from mo_dots import Data, coalesce
from mo_files import File
@ -48,9 +50,9 @@ def _upgrade():
global sqlite3
try:
Log.note("sqlite not upgraded ")
Log.note("sqlite not upgraded")
# return
#
#
# import sys
# import platform
# if "windows" in platform.system().lower():
@ -59,7 +61,7 @@ def _upgrade():
# source_dll = File("vendor/pyLibrary/vendor/sqlite/sqlite3_32.dll")
# else:
# source_dll = File("vendor/pyLibrary/vendor/sqlite/sqlite3_64.dll")
#
#
# if not all(a == b for a, b in zip_longest(source_dll.read_bytes(), original_dll.read_bytes())):
# original_dll.backup()
# File.copy(source_dll, original_dll)
@ -81,7 +83,8 @@ class Sqlite(DB):
canonical = None
def __init__(self, filename=None, db=None, upgrade=True):
@override
def __init__(self, filename=None, db=None, upgrade=True, load_functions=False, kwargs=None):
"""
:param db: Optional, wrap a sqlite db in a thread
:return: Multithread-safe database
@ -89,6 +92,7 @@ class Sqlite(DB):
if upgrade and not _upgraded:
_upgrade()
self.settings = kwargs
self.filename = File(filename).abspath
self.db = db
self.queue = Queue("sql commands") # HOLD (command, result, signal) PAIRS
@ -96,6 +100,8 @@ class Sqlite(DB):
self.get_trace = TRACE
self.upgrade = upgrade
self.closed = False
if DEBUG:
Log.note("Sqlite version {{version}}", version=self.query("select sqlite_version()").data[0][0])
def _enhancements(self):
def regex(pattern, value):
@ -196,28 +202,16 @@ class Sqlite(DB):
try:
if DEBUG:
Log.note("Sqlite version {{version}}", version=sqlite3.sqlite_version)
if Sqlite.canonical:
self.db = Sqlite.canonical
else:
self.db = sqlite3.connect(coalesce(self.filename, ':memory:'), check_same_thread = False)
try:
if Sqlite.canonical:
self.db = Sqlite.canonical
else:
self.db = sqlite3.connect(coalesce(self.filename, ':memory:'), check_same_thread=False)
except Exception as e:
Log.error("could not open file {{filename}}", filename=self.filename)
library_loc = File.new_instance(sys.modules[__name__].__file__, "../..")
full_path = File.new_instance(library_loc, "vendor/sqlite/libsqlitefunctions.so").abspath
try:
trace = extract_stack(0)[0]
if self.upgrade:
if os.name == 'nt':
file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions.so")
else:
file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions")
full_path = file.abspath
self.db.enable_load_extension(True)
self.db.execute(SQL_SELECT + "load_extension" + sql_iso(self.quote_value(full_path)))
except Exception as e:
if not _load_extension_warning_sent:
_load_extension_warning_sent = True
Log.warning("Could not load {{file}}}, doing without. (no SQRT for you!)", file=full_path, cause=e)
if self.settings.load_functions:
self._load_functions()
while not please_stop:
quad = self.queue.pop(till=please_stop)
@ -283,11 +277,25 @@ class Sqlite(DB):
Log.note("Database is closed")
self.db.close()
def quote_column(self, column_name, table=None):
return quote_column(column_name, table)
def _load_functions(self):
global _load_extension_warning_sent
library_loc = File.new_instance(sys.modules[__name__].__file__, "../..")
full_path = File.new_instance(library_loc, "vendor/sqlite/libsqlitefunctions.so").abspath
try:
trace = extract_stack(0)[0]
if self.upgrade:
if os.name == 'nt':
file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions.so")
else:
file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions")
def quote_value(self, value):
return quote_value(value)
full_path = file.abspath
self.db.enable_load_extension(True)
self.db.execute(SQL_SELECT + "load_extension" + sql_iso(quote_value(full_path)))
except Exception as e:
if not _load_extension_warning_sent:
_load_extension_warning_sent = True
Log.warning("Could not load {{file}}, doing without. (no SQRT for you!)", file=full_path, cause=e)
def create_new_functions(self):
@ -297,6 +305,7 @@ class Sqlite(DB):
self.db.create_function("REGEXP", 2, regexp)
_no_need_to_quote = re.compile(r"^\w+$", re.UNICODE)