lib updates
Move config files (no more secrets!); each bug gets its own DB connection.
This commit is contained in:
Parent 83c16b49c5
Commit d4cb9d15c3
.gitignore

@@ -15,3 +15,4 @@ build
 dist
 /pyLibrary/.svn
+/results


bzETL/alias_analysis.py
@@ -1,12 +1,26 @@
 # encoding: utf-8
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 # Author: Kyle Lahnakoski (kyle@lahnakoski.com)
 #
 
 from __future__ import unicode_literals
 from __future__ import division
 from __future__ import absolute_import
 
 from bzETL.extract_bugzilla import get_all_cc_changes
-from pyLibrary.env import startup, elasticsearch
-from pyLibrary.cnv import CNV
-from pyLibrary.queries.es_query import ESQuery
-from pyLibrary.sql.db import DB
-from pyLibrary.env.logs import Log
-from pyLibrary.collections.multiset import Multiset
-from pyLibrary.queries import Q
-from pyLibrary.struct import nvl, set_default
+from pyLibrary import convert
+from pyLibrary.collections import Multiset
+from pyLibrary.debugs import startup
+from pyLibrary.debugs.logs import Log
+from pyLibrary.dot import set_default, coalesce
+from pyLibrary.env import elasticsearch
+from pyLibrary.queries import qb
+from pyLibrary.queries.qb_usingES import FromES
+from pyLibrary.sql.mysql import MySQL
 
 
 def full_analysis(settings, bug_list=None, please_stop=None):
@@ -24,18 +38,18 @@ def full_analysis(settings, bug_list=None, please_stop=None):
     analyzer = AliasAnalyzer(settings.alias)
 
     if bug_list:
-        with DB(settings.bugzilla, readonly=True) as db:
+        with MySQL(settings.bugzilla, readonly=True) as db:
             data = get_all_cc_changes(db, bug_list)
             analyzer.aggregator(data)
             analyzer.analysis(True, please_stop)
             return
 
-    with DB(settings.bugzilla, readonly=True) as db:
-        start = nvl(settings.alias.start, 0)
-        end = nvl(settings.alias.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)
+    with MySQL(settings.bugzilla, readonly=True) as db:
+        start = coalesce(settings.alias.start, 0)
+        end = coalesce(settings.alias.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)
 
         #Perform analysis on blocks of bugs, in case we crash partway through
-        for s, e in Q.intervals(start, end, settings.alias.increment):
+        for s, e in qb.intervals(start, end, settings.alias.increment):
             Log.note("Load range {{start}}-{{end}}", {
                 "start": s,
                 "end": e
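
The block loop above relies on `qb.intervals` (the renamed `Q.intervals`). A minimal stand-in with the behavior assumed here -- half-open blocks clipped to the overall range -- as a sketch, not the library's code:

    def intervals(start, end, size):
        # Yield (s, e) pairs covering [start, end) in blocks of `size`
        s = start
        while s < end:
            yield s, min(s + size, end)
            s += size

    # list(intervals(0, 25, 10)) -> [(0, 10), (10, 20), (20, 25)]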
@@ -56,7 +70,7 @@ class AliasAnalyzer(object):
         try:
             a = set_default({}, settings.elasticsearch, {"type":"alias"})
             self.es = elasticsearch.Cluster(settings.elasticsearch).get_or_create_index(a, ALIAS_SCHEMA, limit_replicas=True)
-            self.esq = ESQuery(self.es)
+            self.esq = FromES(self.es)
             result = self.esq.query({
                 "from":"bug_aliases",
                 "select":["canonical", "alias"]
@@ -69,7 +83,7 @@ class AliasAnalyzer(object):
             # LOAD THE NON-MATCHES
             na = set_default({}, settings.elasticsearch, {"type":"not_alias"})
             es = elasticsearch.Cluster(na).get_or_create_index(na)
-            esq = ESQuery(es)
+            esq = FromES(es)
             result = esq.query({
                 "from":"bug_aliases",
                 "select":["canonical", "alias"]
@@ -110,7 +124,7 @@ class AliasAnalyzer(object):
             if count < 0:
                 problem_agg.add(self.alias(email)["canonical"], amount=count)
 
-        problems = Q.sort([
+        problems = qb.sort([
             {"email": e, "count": c}
             for e, c in problem_agg.dic.iteritems()
             if not self.not_aliases.get(e, None) and (c <= -(DIFF / 2) or last_run)
@@ -126,7 +140,7 @@ class AliasAnalyzer(object):
             for bug_id, agg in self.bugs.iteritems():
                 if agg.dic.get(problem.email, 0) < 0:  #ONLY BUGS THAT ARE EXPERIENCING THIS problem
                     solution_agg += agg
-            solutions = Q.sort([{"email": e, "count": c} for e, c in solution_agg.dic.iteritems()], [{"field": "count", "sort": -1}, "email"])
+            solutions = qb.sort([{"email": e, "count": c} for e, c in solution_agg.dic.iteritems()], [{"field": "count", "sort": -1}, "email"])
 
             if last_run and len(solutions) == 2 and solutions[0].count == -solutions[1].count:
                 #exact match
@@ -140,7 +154,7 @@ class AliasAnalyzer(object):
                 "problem": problem.email,
                 "score": problem.count,
                 "solution": best_solution.email,
-                "matches": CNV.object2JSON(Q.select(solutions, "count")[:10:])
+                "matches": convert.value2json(qb.select(solutions, "count")[:10:])
             })
             try_again = True
             self.add_alias(problem.email, best_solution.email)


145 bzETL/bz_etl.py
@@ -1,6 +1,5 @@
 # encoding: utf-8
-#
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.
@@ -8,26 +7,27 @@
 # Author: Kyle Lahnakoski (kyle@lahnakoski.com)
 #
 
 from __future__ import unicode_literals
 from __future__ import division
 from __future__ import absolute_import
 
 # REPLACES THE KETTLE FLOW CONTROL PROGRAM, AND BASH SCRIPT
 
 
-from pyLibrary.maths import Math
-from pyLibrary import struct, jsons
-from pyLibrary.env.logs import Log
-from pyLibrary.struct import Struct, nvl
+from bzETL import extract_bugzilla, transform_bugzilla, alias_analysis, parse_bug_history
+from bzETL.extract_bugzilla import *
+from bzETL.parse_bug_history import BugHistoryParser
+from pyLibrary import jsons, convert
+from pyLibrary.debugs import startup, constants
+from pyLibrary.debugs.logs import Log
+from pyLibrary.dot import wrap, coalesce, Dict, listwrap, set_default
+from pyLibrary.env import elasticsearch
+from pyLibrary.env.elasticsearch import Cluster
+from pyLibrary.env.files import File
-from pyLibrary.env import startup
-from pyLibrary.thread.threads import Queue, Thread, AllThread, Lock, ThreadedQueue
-from pyLibrary.cnv import CNV
-from pyLibrary.env.elasticsearch import ElasticSearch
-from pyLibrary.queries import Q
-from pyLibrary.sql.db import DB
-
-from bzETL import parse_bug_history, transform_bugzilla, extract_bugzilla, alias_analysis
+from pyLibrary.maths import Math
+from pyLibrary.queries import qb
+from pyLibrary.sql.mysql import MySQL
+from pyLibrary.thread.threads import Lock, AllThread, Thread, Queue, ThreadedQueue
 from pyLibrary.times.timer import Timer
-from extract_bugzilla import get_private_bugs_for_delete, get_recent_private_attachments, get_recent_private_comments, get_comments, get_comments_by_id, get_recent_private_bugs, get_current_time, get_bugs, get_dependencies, get_flags, get_new_activities, get_bug_see_also, get_attachments, get_tracking_flags, get_keywords, get_cc, get_bug_groups, get_duplicates
-from parse_bug_history import BugHistoryParser
 
 
 db_cache_lock = Lock()
@@ -55,14 +55,14 @@ def etl_comments(db, es, param, please_stop):
     # CONNECTIONS ARE EXPENSIVE, CACHE HERE
     with comment_db_cache_lock:
         if not comment_db_cache:
-            comment_db = DB(db)
+            comment_db = MySQL(db.settings)
             comment_db_cache.append(comment_db)
 
     with comment_db_cache_lock:
         Log.note("Read comments from database")
         comments = get_comments(comment_db_cache[0], param)
 
-    for g, c in Q.groupby(comments, size=500):
+    for g, c in qb.groupby(comments, size=500):
         with Timer("Write {{num}} comments to ElasticSearch", {"num": len(c)}):
             es.extend({"id": cc.comment_id, "value": cc} for cc in c)
@@ -72,27 +72,35 @@ def etl(db, output_queue, param, please_stop):
     PROCESS RANGE, AS SPECIFIED IN param AND PUSH
     BUG VERSION RECORDS TO output_queue
     """
+    NUM_CONNECTIONS = 10
+
-    # CONNECTIONS ARE EXPENSIVE, CACHE HERE
+    # MAKING CONNECTIONS ARE EXPENSIVE, CACHE HERE
     with db_cache_lock:
         if not db_cache:
             with Timer("open connections to db"):
-                for f in get_stuff_from_bugzilla:
-                    db = DB(db)
-                    db_cache.append(db)
+                for i in range(NUM_CONNECTIONS):
+                    db_cache.append(MySQL(db.settings))
 
+    db_results = Queue(name="db results", max=2**30)
+
+    def get_records_from_bugzilla(db, param, please_stop):
+        for get_stuff in get_stuff_from_bugzilla:
+            if please_stop:
+                break
+            db_results.extend(get_stuff(db, param))
+
-    db_results = Queue(max=2**30)
-    with db_cache_lock:
-        # ASYMMETRIC MULTI THREADING TO GET RECORDS FROM DB
+    with db_cache_lock:
+        # SPLIT TASK EVENLY, HAVE EACH BUG USE SAME CONNECTION FOR ALL DATA
+        size = Math.ceiling(float(len(param.bug_list))/float(10))
         with AllThread() as all:
-            for i, f in enumerate(get_stuff_from_bugzilla):
-                def process(target, db, param, please_stop):
-                    db_results.extend(target(db, param))
-
-                all.add(process, f, db_cache[i], param.copy())
+            for g, bug_ids in qb.groupby(param.bug_list, size=size):
+                all.add(get_records_from_bugzilla, db_cache[g], set_default(
+                    {"bug_list": bug_ids},
+                    param
+                ))
     db_results.add(Thread.STOP)
 
-    sorted = Q.sort(db_results, [
+    sorted = qb.sort(db_results, [
         "bug_id",
         "_merge_order",
         {"field": "modified_ts", "sort": -1},
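
The rewritten block above is the "each bug has own db connection" change from the commit message: instead of one thread per extraction query, the bug list is split into groups and each group runs all of its queries over a single cached connection. A self-contained sketch of that scheduling idea (helper names are hypothetical, not the pyLibrary API):

    import math
    import threading

    def run_in_groups(bug_list, connections, worker):
        # SPLIT TASK EVENLY: each group of bugs reuses one connection for all its queries
        size = int(math.ceil(float(len(bug_list)) / float(len(connections))))
        threads = []
        for g, conn in enumerate(connections):
            bug_ids = bug_list[g * size:(g + 1) * size]
            if bug_ids:
                t = threading.Thread(target=worker, args=(conn, bug_ids))
                t.start()
                threads.append(t)
        for t in threads:
            t.join()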
@@ -102,7 +110,7 @@ def etl(db, output_queue, param, please_stop):
     process = BugHistoryParser(param, output_queue)
     for s in sorted:
         process.processRow(s)
-    process.processRow(struct.wrap({"bug_id": parse_bug_history.STOP_BUG, "_merge_order": 1}))
+    process.processRow(wrap({"bug_id": parse_bug_history.STOP_BUG, "_merge_order": 1}))
 
 
 def run_both_etl(db, output_queue, es_comments, param):
@@ -128,8 +136,8 @@ def setup_es(settings, db, es, es_comments):
         # INCREMENTAL UPDATE; DO NOT MAKE NEW INDEX
         last_run_time = long(File(settings.param.last_run_time).read())
         if not es:
-            es = ElasticSearch(settings.es)
-            es_comments = ElasticSearch(settings.es_comments)
+            es = elasticsearch.Index(settings.es)
+            es_comments = elasticsearch.Index(settings.es_comments)
     elif File(settings.param.first_run_time).exists:
         # DO NOT MAKE NEW INDEX, CONTINUE INITIAL FILL
         try:
@@ -137,17 +145,17 @@ def setup_es(settings, db, es, es_comments):
             current_run_time = long(File(settings.param.first_run_time).read())
             if not es:
                 if not settings.es.alias:
-                    temp = ElasticSearch(settings.es).get_proto(settings.es.index)
+                    temp = Cluster(settings.es).get_proto(settings.es.index)
                     settings.es.alias = settings.es.index
                     settings.es.index = temp.last()
-                es = ElasticSearch(settings.es)
+                es = elasticsearch.Index(settings.es)
                 es.set_refresh_interval(1)  #REQUIRED SO WE CAN SEE WHAT BUGS HAVE BEEN LOADED ALREADY
 
                 if not settings.es_comments.alias:
-                    temp = ElasticSearch(settings.es_comments).get_proto(settings.es_comments.index)
+                    temp = Cluster(settings.es_comments).get_proto(settings.es_comments.index)
                     settings.es_comments.alias = settings.es_comments.index
                     settings.es_comments.index = temp.last()
-                es_comments = ElasticSearch(settings.es_comments)
+                es_comments = elasticsearch.Index(settings.es_comments)
         except Exception, e:
             Log.warning("can not resume ETL, restarting", e)
             File(settings.param.first_run_time).delete()
@@ -160,23 +168,23 @@ def setup_es(settings, db, es, es_comments):
         schema = File(settings.es.schema_file).read()
         if transform_bugzilla.USE_ATTACHMENTS_DOT:
             schema = schema.replace("attachments_", "attachments\\.")
-        schema=CNV.JSON2object(schema, paths=True)
+        schema=convert.json2value(schema, paths=True)
         schema.settings=jsons.expand_dot(schema.settings)
         if not settings.es.alias:
             settings.es.alias = settings.es.index
-            settings.es.index = ElasticSearch.proto_name(settings.es.alias)
-        es = ElasticSearch.create_index(settings.es, schema, limit_replicas=True)
+            settings.es.index = Cluster.proto_name(settings.es.alias)
+        es = Cluster.create_index(settings.es, schema, limit_replicas=True)
 
         # BUG COMMENTS
         comment_schema = File(settings.es_comments.schema_file).read()
-        comment_schema=CNV.JSON2object(comment_schema, paths=True)
+        comment_schema=convert.json2value(comment_schema, paths=True)
         comment_schema.settings=jsons.expand_dot(comment_schema.settings)
         if not settings.es_comments.alias:
             settings.es_comments.alias = settings.es_comments.index
-            settings.es_comments.index = ElasticSearch.proto_name(settings.es_comments.alias)
-        es_comments = ElasticSearch.create_index(settings.es_comments, comment_schema, limit_replicas=True)
+            settings.es_comments.index = Cluster.proto_name(settings.es_comments.alias)
+        es_comments = Cluster.create_index(settings.es_comments, comment_schema, limit_replicas=True)
 
-        File(settings.param.first_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
+        File(settings.param.first_run_time).write(unicode(convert.datetime2milli(current_run_time)))
 
     return current_run_time, es, es_comments, last_run_time
 
@@ -190,7 +198,7 @@ def incremental_etl(settings, param, db, es, es_comments, output_queue):
    #REMOVE PRIVATE BUGS
    private_bugs = get_private_bugs_for_delete(db, param)
    Log.note("Ensure the following private bugs are deleted:\n{{private_bugs|indent}}", {"private_bugs": sorted(private_bugs)})
-    for g, delete_bugs in Q.groupby(private_bugs, size=1000):
+    for g, delete_bugs in qb.groupby(private_bugs, size=1000):
        still_existing = get_bug_ids(es, {"terms": {"bug_id": delete_bugs}})
        if still_existing:
            Log.note("Ensure the following existing private bugs are deleted:\n{{private_bugs|indent}}", {"private_bugs": sorted(still_existing)})
@@ -212,7 +220,7 @@ def incremental_etl(settings, param, db, es, es_comments, output_queue):
 
    #REMOVE **RECENT** PRIVATE ATTACHMENTS
    private_attachments = get_recent_private_attachments(db, param)
-    bugs_to_refresh = set(Q.select(private_attachments, "bug_id"))
+    bugs_to_refresh = set(qb.select(private_attachments, "bug_id"))
    es.delete_record({"terms": {"bug_id": bugs_to_refresh}})
 
    #REBUILD BUGS THAT GOT REMOVED
@@ -234,7 +242,7 @@ def incremental_etl(settings, param, db, es, es_comments, output_queue):
 
    #REFRESH COMMENTS WITH PRIVACY CHANGE
    private_comments = get_recent_private_comments(db, param)
-    comment_list = set(Q.select(private_comments, "comment_id")) | {0}
+    comment_list = set(qb.select(private_comments, "comment_id")) | {0}
    es_comments.delete_record({"terms": {"comment_id": comment_list}})
    changed_comments = get_comments_by_id(db, comment_list, param)
    es_comments.extend({"id": c.comment_id, "value": c} for c in changed_comments)
@@ -242,7 +250,7 @@ def incremental_etl(settings, param, db, es, es_comments, output_queue):
    #GET LIST OF CHANGED BUGS
    with Timer("time to get changed bug list"):
        if param.allow_private_bugs:
-            bug_list = Q.select(db.query("""
+            bug_list = qb.select(db.query("""
                SELECT
                    b.bug_id
                FROM
@@ -253,7 +261,7 @@ def incremental_etl(settings, param, db, es, es_comments, output_queue):
                "start_time_str": param.start_time_str
            }), u"bug_id")
        else:
-            bug_list = Q.select(db.query("""
+            bug_list = qb.select(db.query("""
                SELECT
                    b.bug_id
                FROM
@@ -286,10 +294,10 @@ def incremental_etl(settings, param, db, es, es_comments, output_queue):
 
 def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue):
     with Thread.run("alias_analysis", alias_analysis.full_analysis, settings=settings):
-        end = nvl(settings.param.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)
-        start = nvl(settings.param.start, 0)
+        end = coalesce(settings.param.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)
+        start = coalesce(settings.param.start, 0)
         if resume_from_last_run:
-            start = nvl(settings.param.start, Math.floor(get_max_bug_id(es), settings.param.increment))
+            start = coalesce(settings.param.start, Math.floor(get_max_bug_id(es), settings.param.increment))
 
         #############################################################
         ## MAIN ETL LOOP
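
`Math.floor(value, mod)` with two arguments is assumed here to floor to the nearest lower multiple, so a resumed run restarts on a block boundary; the equivalent in plain Python:

    def floor_to(value, mod):
        # Largest multiple of `mod` that is <= value
        return (int(value) // mod) * mod

    # floor_to(123456, 10000) -> 120000: resume at the start of the last full block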
@@ -297,7 +305,7 @@ def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue):
 
         #TWO WORKERS IS MORE THAN ENOUGH FOR A SINGLE THREAD
         # with Multithread([run_both_etl, run_both_etl]) as workers:
-        for min, max in Q.intervals(start, end, settings.param.increment):
+        for min, max in qb.intervals(start, end, settings.param.increment):
             if settings.args.quick and min < end - settings.param.increment and min != 0:
                 #--quick ONLY DOES FIRST AND LAST BLOCKS
                 continue
@@ -306,7 +314,7 @@ def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue):
             #GET LIST OF CHANGED BUGS
             with Timer("time to get {{min}}..{{max}} bug list", {"min":min, "max":max}):
                 if param.allow_private_bugs:
-                    bug_list = Q.select(db.query("""
+                    bug_list = qb.select(db.query("""
                        SELECT
                            b.bug_id
                        FROM
@@ -320,7 +328,7 @@ def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue):
                        "start_time_str": param.start_time_str
                    }), u"bug_id")
                 else:
-                    bug_list = Q.select(db.query("""
+                    bug_list = qb.select(db.query("""
                        SELECT
                            b.bug_id
                        FROM
@@ -363,17 +371,17 @@ def main(settings, es=None, es_comments=None):
 
     #MAKE HANDLES TO CONTAINERS
     try:
-        with DB(settings.bugzilla, readonly=True) as db:
+        with MySQL(settings.bugzilla, readonly=True) as db:
             current_run_time, es, es_comments, last_run_time = setup_es(settings, db, es, es_comments)
 
-            with ThreadedQueue(es, size=500, silent=True) as output_queue:
+            with ThreadedQueue(es, max_size=500, silent=True) as output_queue:
                 #SETUP RUN PARAMETERS
-                param = Struct()
-                param.end_time = CNV.datetime2milli(get_current_time(db))
-                # DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delats_ts))
+                param = Dict()
+                param.end_time = convert.datetime2milli(get_current_time(db))
+                # MySQL WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delats_ts))
                 # THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE WILL GO BACK 60sec, JUST IN CASE.
                 # THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
-                param.start_time = last_run_time - nvl(settings.param.look_back, 5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
+                param.start_time = last_run_time - coalesce(settings.param.look_back, 5 * 60 * 1000)  # 5 MINUTE LOOK_BACK
                 param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
                 param.alias_file = settings.param.alias_file
                 param.allow_private_bugs = settings.param.allow_private_bugs
@@ -395,7 +403,7 @@ def main(settings, es=None, es_comments=None):
                 es.delete_all_but(settings.es_comments.alias, settings.es_comments.index)
                 es_comments.add_alias(settings.es_comments.alias)
 
-            File(settings.param.last_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
+            File(settings.param.last_run_time).write(unicode(convert.datetime2milli(current_run_time)))
     except Exception, e:
         Log.error("Problem with main ETL loop", e)
     finally:
@@ -454,11 +462,13 @@ def get_max_bug_id(es):
 
 
 def close_db_connections():
-    (globals()["db_cache"], temp) = ([], db_cache)
+    global db_cache, comment_db_cache
+
+    db_cache, temp = [], db_cache
     for db in temp:
         db.close()
 
-    (globals()["comment_db_cache"], temp) = ([], comment_db_cache)
+    comment_db_cache, temp = [], comment_db_cache
     for db in temp:
         db.close()
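
The rewritten `close_db_connections` swaps in an empty cache before closing the detached one, so a concurrent caller sees either the old list or a fresh empty list, never a half-closed one. The same pattern in isolation (the connection class is a stand-in):

    class FakeConnection(object):
        def close(self):
            pass

    db_cache = [FakeConnection() for _ in range(3)]

    def close_all():
        # Rebind the module-level name first, then close the detached list
        global db_cache
        db_cache, doomed = [], db_cache
        for db in doomed:
            db.close()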
@@ -476,10 +486,11 @@ def start():
         "action": "store_true",
         "dest": "restart"
     }])
+    constants.set(settings.constants)
 
     with startup.SingleInstance(flavor_id=settings.args.filename):
         if settings.args.restart:
-            for l in struct.listwrap(settings.debug.log):
+            for l in listwrap(settings.debug.log):
                 if l.filename:
                     File(l.filename).parent.delete()
             File(settings.param.first_run_time).delete()
bzETL/extract_bugzilla.py

@@ -1,26 +1,26 @@
 # encoding: utf-8
 #
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 # PYTHON VERSION OF https://github.com/mozilla-metrics/bugzilla_etl/blob/master/transformations/bugzilla_to_json.ktr
 # Author: Kyle Lahnakoski (kyle@lahnakoski.com)
 #
 
 from __future__ import unicode_literals
 from __future__ import division
 from __future__ import absolute_import
 
 from bzETL.parse_bug_history import MAX_TIME
-from pyLibrary.cnv import CNV
-from pyLibrary.queries.db_query import esfilter2sqlwhere
-from pyLibrary.sql.db import SQL
-
-from pyLibrary.env.logs import Log
-from pyLibrary.queries import Q
-from pyLibrary.struct import Struct
-
-
-#ALL BUGS IN PRIVATE ETL HAVE SCREENED FIELDS
+from pyLibrary import convert
+from pyLibrary.debugs.logs import Log
+from pyLibrary.dot import Dict
+from pyLibrary.queries import qb
+from pyLibrary.queries.qb_usingMySQL import esfilter2sqlwhere
+from pyLibrary.sql import SQL
+from pyLibrary.times.timer import Timer
 
 #ALL BUGS IN PRIVATE ETL HAVE SCREENED FIELDS
 SCREENED_FIELDDEFS = [
     19,  #bug_file_loc
     24,  #short_desc
@@ -67,7 +67,7 @@ def get_current_time(db):
        SELECT
            UNIX_TIMESTAMP(now()) `value`
        """)[0].value
-    return CNV.unix2datetime(output)
+    return convert.unix2datetime(output)
 
 
 def milli2string(db, value):
@@ -90,7 +90,7 @@ def get_screened_whiteboard(db):
        groups = db.query("SELECT id FROM groups WHERE {{where}}", {
            "where": esfilter2sqlwhere(db, {"terms": {"name": SCREENED_WHITEBOARD_BUG_GROUPS}})
        })
-        globals()["SCREENED_BUG_GROUP_IDS"] = Q.select(groups, "id")
+        globals()["SCREENED_BUG_GROUP_IDS"] = qb.select(groups, "id")
 
 
 def get_bugs_table_columns(db, schema_name):
@@ -226,7 +226,7 @@ def get_bugs(db, param):
        else:
            return db.quote_column(col.column_name)
 
-    param.bugs_columns = Q.select(bugs_columns, "column_name")
+    param.bugs_columns = qb.select(bugs_columns, "column_name")
    param.bugs_columns_SQL = SQL(",\n".join([lower(c) for c in bugs_columns]))
    param.bug_filter = esfilter2sqlwhere(db, {"terms": {"b.bug_id": param.bug_list}})
    param.screened_whiteboard = esfilter2sqlwhere(db, {"and": [
@@ -290,7 +290,7 @@ def get_bugs(db, param):
 def flatten_bugs_record(r, output):
     for field_name, value in r.items():
         if value != "---":
-            newRow = Struct()
+            newRow = Dict()
             newRow.bug_id = r.bug_id
             newRow.modified_ts = r.modified_ts
             newRow.modified_by = r.modified_by
@@ -523,7 +523,7 @@ def flatten_attachments(data):
         for k,v in r.items():
             if k=="bug_id":
                 continue
-            output.append(Struct(
+            output.append(Dict(
                 bug_id=r.bug_id,
                 modified_ts=r.modified_ts,
                 modified_by=r.modified_by,
bzETL/parse_bug_history.py

@@ -1,11 +1,11 @@
 # encoding: utf-8
 #
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 # Author: Kyle Lahnakoski (kyle@lahnakoski.com)
 #
 
 # Workflow:
 # Create the current state object
@@ -37,20 +37,20 @@
 
 
 from __future__ import unicode_literals
 from __future__ import division
 from __future__ import absolute_import
 
 import re
 import math
-from pyLibrary import struct, strings
-
+from pyLibrary import convert, strings
 from pyLibrary.collections import MIN
-from pyLibrary.strings import apply_diff
-from pyLibrary.struct import nvl, StructList, unwrap, wrap
-from pyLibrary.cnv import CNV
-from pyLibrary.env.logs import Log
-from pyLibrary.queries import Q
-from pyLibrary.struct import Struct, Null
+from pyLibrary.debugs.logs import Log
+from pyLibrary.dot import Null, wrap, DictList, Dict, coalesce, unwrap, inverse
 from pyLibrary.env.files import File
 
-from transform_bugzilla import normalize, NUMERIC_FIELDS, MULTI_FIELDS, DIFF_FIELDS
-
+from pyLibrary.queries import qb
+from pyLibrary.strings import apply_diff
+from bzETL.transform_bugzilla import normalize, NUMERIC_FIELDS, MULTI_FIELDS, DIFF_FIELDS
 
 
 # Used to split a flag into (type, status [,requestee])
@@ -76,7 +76,7 @@ MAX_TIME = 9999999999000
 class BugHistoryParser():
     def __init__(self, settings, output_queue):
         self.aliases = Null
-        self.startNewBug(struct.wrap({"bug_id": 0, "modified_ts": 0, "_merge_order": 1}))
+        self.startNewBug(wrap({"bug_id": 0, "modified_ts": 0, "_merge_order": 1}))
         self.prevActivityID = Null
         self.prev_row = Null
         self.settings = settings
@@ -107,8 +107,8 @@ class BugHistoryParser():
         # Bugzilla bug workaround - some values were truncated, introducing uncertainty / errors:
         # https://bugzilla.mozilla.org/show_bug.cgi?id=55161
         if row_in.field_name in TRUNC_FIELDS:
-            added = CNV.value2string(row_in.new_value)
-            removed = CNV.value2string(row_in.old_value)
+            added = convert.value2string(row_in.new_value)
+            removed = convert.value2string(row_in.old_value)
             uncertain = False
 
             if added in ["? ?", "?"]:  # Unknown value extracted from a possibly truncated field
@@ -131,7 +131,7 @@ class BugHistoryParser():
                 # Process the "uncertain" flag as an activity
                 # WE ARE GOING BACKWARDS IN TIME, SO MARKUP PAST
                 Log.note("[Bug {{bug_id}}]: PROBLEM Setting this bug to be uncertain.", {"bug_id": self.currBugID})
-                self.processBugsActivitiesTableItem(struct.wrap({
+                self.processBugsActivitiesTableItem(wrap({
                     "modified_ts": row_in.modified_ts,
                     "modified_by": row_in.modified_by,
                     "field_name": "uncertain",
@@ -144,7 +144,7 @@ class BugHistoryParser():
                 return
 
         # Treat timestamps as int values
-        new_value = CNV.value2int(row_in.new_value) if row_in.field_name.endswith("_ts") else row_in.new_value
+        new_value = convert.value2int(row_in.new_value) if row_in.field_name.endswith("_ts") else row_in.new_value
 
 
         # Determine where we are in the bug processing workflow
@@ -181,11 +181,11 @@ class BugHistoryParser():
 
     def startNewBug(self, row_in):
         self.prevBugID = row_in.bug_id
-        self.bugVersions = StructList()
-        self.bugVersionsMap = Struct()
-        self.currActivity = Struct()
-        self.currBugAttachmentsMap = Struct()
-        self.currBugState = Struct(
+        self.bugVersions = DictList()
+        self.bugVersionsMap = Dict()
+        self.currActivity = Dict()
+        self.currBugAttachmentsMap = Dict()
+        self.currBugState = Dict(
             _id=BugHistoryParser.uid(row_in.bug_id, row_in.modified_ts),
             bug_id=row_in.bug_id,
             modified_ts=row_in.modified_ts,
@@ -199,7 +199,7 @@ class BugHistoryParser():
         #WE FORCE ADD ALL SETS, AND WE WILL scrub() THEM OUT LATER IF NOT USED
         for f in MULTI_FIELDS:
             self.currBugState[f] = set([])
-        self.currBugState.flags = StructList()  #FLAGS ARE MULTI_FIELDS, BUT ARE ALSO STRUCTS, SO MUST BE IN AN ARRAY
+        self.currBugState.flags = DictList()  #FLAGS ARE MULTI_FIELDS, BUT ARE ALSO STRUCTS, SO MUST BE IN AN ARRAY
 
         if row_in._merge_order != 1:
             # Problem: No entry found in the 'bugs' table.
@@ -229,7 +229,7 @@ class BugHistoryParser():
         if currActivityID != self.prevActivityID:
             self.prevActivityID = currActivityID
 
-            self.currActivity = Struct(
+            self.currActivity = Dict(
                 _id=currActivityID,
                 modified_ts=row_in.modified_ts,
                 modified_by=row_in.modified_by,
@@ -251,7 +251,7 @@ class BugHistoryParser():
                 "modified_ts": row_in.modified_ts,
                 "created_ts": row_in.created_ts,
                 "modified_by": row_in.modified_by,
-                "flags": StructList()
+                "flags": DictList()
             }
             self.currBugAttachmentsMap[unicode(row_in.attach_id)] = att
 
@@ -292,7 +292,7 @@ class BugHistoryParser():
         if currActivityID != self.prevActivityID:
             self.currActivity = self.bugVersionsMap[currActivityID]
             if self.currActivity == None:
-                self.currActivity = Struct(
+                self.currActivity = Dict(
                     _id=currActivityID,
                     modified_ts=row_in.modified_ts,
                     modified_by=row_in.modified_by,
@@ -377,7 +377,7 @@ class BugHistoryParser():
     def populateIntermediateVersionObjects(self):
         # Make sure the self.bugVersions are in descending order by modification time.
         # They could be mixed because of attachment activity
-        self.bugVersions = Q.sort(self.bugVersions, [
+        self.bugVersions = qb.sort(self.bugVersions, [
             {"field": "modified_ts", "sort": -1}
         ])
 
@@ -385,7 +385,7 @@ class BugHistoryParser():
         prevValues = {}
         currVersion = Null
         # Prime the while loop with an empty next version so our first iteration outputs the initial bug state
-        nextVersion = Struct(_id=self.currBugState._id, changes=[])
+        nextVersion = Dict(_id=self.currBugState._id, changes=[])
 
         flagMap = {}
         # A monotonically increasing version number (useful for debugging)
@@ -431,7 +431,7 @@ class BugHistoryParser():
                 mergeBugVersion = True
 
             # Link this version to the next one (if there is a next one)
-            self.currBugState.expires_on = nvl(nextVersion.modified_ts, MAX_TIME)
+            self.currBugState.expires_on = coalesce(nextVersion.modified_ts, MAX_TIME)
 
             # Copy all attributes from the current version into self.currBugState
             for propName, propValue in currVersion.items():
@@ -439,7 +439,7 @@ class BugHistoryParser():
 
             # Now walk self.currBugState forward in time by applying the changes from currVersion
             #BE SURE TO APPLY REMOVES BEFORE ADDS, JUST IN CASE BOTH HAPPENED TO ONE FIELD
-            changes = Q.sort(currVersion.changes, ["attach_id", "field_name", {"field": "old_value", "sort": -1}, "new_value"])
+            changes = qb.sort(currVersion.changes, ["attach_id", "field_name", {"field": "old_value", "sort": -1}, "new_value"])
             currVersion.changes = changes
             self.currBugState.changes = changes
 
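
`qb.sort` takes a list of sort columns, each either a field name or `{"field": ..., "sort": -1}` for descending. Assuming those semantics, a rough plain-Python equivalent using repeated stable sorts:

    def multi_sort(rows, specs):
        # Apply specs right-to-left; stable sorts make the leftmost spec dominate
        for spec in reversed(specs):
            if isinstance(spec, dict):
                field, descending = spec["field"], spec.get("sort", 1) == -1
            else:
                field, descending = spec, False
            rows = sorted(rows, key=lambda r: r[field], reverse=descending)
        return rows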
@@ -461,7 +461,7 @@ class BugHistoryParser():
                     continue
 
                 if DEBUG_CHANGES:
-                    Log.note("Processing change: " + CNV.object2JSON(change))
+                    Log.note("Processing change: " + convert.value2json(change))
                 target = self.currBugState
                 targetName = "currBugState"
                 attach_id = change.attach_id
@@ -562,7 +562,7 @@ class BugHistoryParser():
     def processFlagChange(self, target, change, modified_ts, modified_by):
         if target.flags == None:
             Log.note("[Bug {{bug_id}}]: PROBLEM processFlagChange called with unset 'flags'", {"bug_id": self.currBugState.bug_id})
-            target.flags = StructList()
+            target.flags = DictList()
 
         addedFlags = BugHistoryParser.getMultiFieldValue("flags", change.new_value)
         removedFlags = BugHistoryParser.getMultiFieldValue("flags", change.old_value)
|
@ -685,7 +685,7 @@ class BugHistoryParser():
|
|||
|
||||
if chosen_one != None:
|
||||
for f in ["value", "request_status", "requestee"]:
|
||||
chosen_one[f] = nvl(added_flag[f], chosen_one[f])
|
||||
chosen_one[f] = coalesce(added_flag[f], chosen_one[f])
|
||||
|
||||
# We need to avoid later adding this flag twice, since we rolled an add into a delete.
|
||||
|
||||
|
@@ -723,7 +723,7 @@ class BugHistoryParser():
         # if flag==u'review?(bjacob@mozilla.co':
         #     Log.debug()
 
-        flagParts = Struct(
+        flagParts = Dict(
             modified_ts=modified_ts,
             modified_by=modified_by,
             value=flag
@@ -742,7 +742,7 @@ class BugHistoryParser():
     def addValues(self, total, add, valueType, field_name, target):
         if not add:
             return total
-        # Log.note("[Bug {{bug_id}}]: Adding " + valueType + " " + fieldName + " values:" + CNV.object2JSON(someValues))
+        # Log.note("[Bug {{bug_id}}]: Adding " + valueType + " " + fieldName + " values:" + convert.value2json(someValues))
         if field_name == "flags":
             Log.error("use processFlags")
         else:
@@ -763,7 +763,7 @@ class BugHistoryParser():
                 self.currActivity.changes.append({
                     "field_name": field_name,
                     "new_value": Null,
-                    "old_value": ", ".join(map(unicode, Q.sort(diff))),
+                    "old_value": ", ".join(map(unicode, qb.sort(diff))),
                     "attach_id": target.attach_id
                 })
 
@@ -780,7 +780,7 @@ class BugHistoryParser():
             if valueType == "added" and remove:
                 self.currActivity.changes.append({
                     "field_name": field_name,
-                    "new_value": u", ".join(map(unicode, Q.sort(remove))),
+                    "new_value": u", ".join(map(unicode, qb.sort(remove))),
                     "old_value": Null,
                     "attach_id": target.attach_id
                 })
@@ -800,8 +800,8 @@ class BugHistoryParser():
             return output
         elif field_name == "cc":
             # MAP CANONICAL TO EXISTING (BETWEEN map_* AND self.aliases WE HAVE A BIJECTION)
-            map_total = struct.inverse({t: self.alias(t) for t in total})
-            map_remove = struct.inverse({r: self.alias(r) for r in remove})
+            map_total = inverse({t: self.alias(t) for t in total})
+            map_remove = inverse({r: self.alias(r) for r in remove})
             # CANONICAL VALUES
             c_total = set(map_total.keys())
             c_remove = set(map_remove.keys())
@@ -816,8 +816,8 @@ class BugHistoryParser():
                     "type": valueType,
                     "object": arrayDesc,
                     "field_name": field_name,
-                    "missing": Q.sort(Q.map2set(diff, map_remove)),
-                    "existing": Q.sort(total),
+                    "missing": qb.sort(qb.map2set(diff, map_remove)),
+                    "existing": qb.sort(total),
                     "candidates": {d: self.aliases.get(d, None) for d in diff},
                     "bug_id": self.currBugID
                 })
@@ -879,18 +879,18 @@ class BugHistoryParser():
                         "diff": diff,
                         "output": output
                     })
-                final_removed = Q.map2set(removed, map_total)
+                final_removed = qb.map2set(removed, map_total)
                 if final_removed:
                     self.currActivity.changes.append({
                         "field_name": field_name,
-                        "new_value": u", ".join(map(unicode, Q.sort(final_removed))),
+                        "new_value": u", ".join(map(unicode, qb.sort(final_removed))),
                         "old_value": Null,
                         "attach_id": target.attach_id
                     })
             except Exception, email:
                 Log.error("issues", email)
 
-            return Q.map2set(output, map_total)
+            return qb.map2set(output, map_total)
         else:
             removed = total & remove
             diff = remove - total
@@ -899,7 +899,7 @@ class BugHistoryParser():
             if valueType == "added" and removed:
                 self.currActivity.changes.append({
                     "field_name": field_name,
-                    "new_value": u", ".join(map(unicode, Q.sort(removed))),
+                    "new_value": u", ".join(map(unicode, qb.sort(removed))),
                     "old_value": Null,
                     "attach_id": target.attach_id
                 })
@@ -917,13 +917,13 @@ class BugHistoryParser():
         return output
 
     def processFlags(self, total, old_values, new_values, modified_ts, modified_by, target_type, target):
-        added_values = StructList()  #FOR SOME REASON, REMOVAL BY OBJECT DOES NOT WORK, SO WE USE THIS LIST OF STRING VALUES
+        added_values = DictList()  #FOR SOME REASON, REMOVAL BY OBJECT DOES NOT WORK, SO WE USE THIS LIST OF STRING VALUES
         for v in new_values:
             flag = BugHistoryParser.makeFlag(v, modified_ts, modified_by)
 
             if flag.request_type == None:
                 Log.note("[Bug {{bug_id}}]: PROBLEM Unable to parse flag {{flag}} (caused by 255 char limit?)", {
-                    "flag": CNV.value2quote(flag.value),
+                    "flag": convert.value2quote(flag.value),
                     "bug_id": self.currBugID
                 })
                 continue
@@ -940,7 +940,7 @@ class BugHistoryParser():
             else:
                 Log.note("[Bug {{bug_id}}]: PROBLEM Unable to find {{type}} FLAG: {{object}}.{{field_name}}: (All {{missing}}" + " not in : {{existing}})", {
                     "type": target_type,
-                    "object": nvl(target.attach_id, target.bug_id),
+                    "object": coalesce(target.attach_id, target.bug_id),
                     "field_name": "flags",
                     "missing": v,
                     "existing": total,
@@ -951,21 +951,21 @@ class BugHistoryParser():
         if added_values:
             self.currActivity.changes.append({
                 "field_name": "flags",
-                "new_value": ", ".join(Q.sort(added_values.value)),
+                "new_value": ", ".join(qb.sort(added_values.value)),
                 "old_value": Null,
                 "attach_id": target.attach_id
             })
 
         if not old_values:
             return total
-        # Log.note("[Bug {{bug_id}}]: Adding " + valueType + " " + fieldName + " values:" + CNV.object2JSON(someValues))
+        # Log.note("[Bug {{bug_id}}]: Adding " + valueType + " " + fieldName + " values:" + convert.value2json(someValues))
         for v in old_values:
             total.append(BugHistoryParser.makeFlag(v, target.modified_ts, target.modified_by))
 
         self.currActivity.changes.append({
             "field_name": "flags",
             "new_value": Null,
-            "old_value": ", ".join(Q.sort(old_values)),
+            "old_value": ", ".join(qb.sort(old_values)),
             "attach_id": target.attach_id
         })
 
@@ -991,7 +991,7 @@ class BugHistoryParser():
     def alias(self, name):
         if name == None:
             return Null
-        return nvl(self.aliases.get(name, Null).canonical, name)
+        return coalesce(self.aliases.get(name, Null).canonical, name)
 
 
     def initializeAliases(self):
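
`coalesce` is the renamed `nvl`: it returns its first argument that is not None. Ignoring pyLibrary's special `Null` handling, a minimal equivalent:

    def coalesce(*values):
        # Mirrors SQL COALESCE: first non-None value, else None
        for v in values:
            if v is not None:
                return v
        return None

    # coalesce(None, "canonical@example.com") -> "canonical@example.com"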
@@ -1000,7 +1000,7 @@ class BugHistoryParser():
             alias_json = File(self.settings.alias_file).read()
         except Exception, e:
             alias_json = "{}"
-        self.aliases = {k: struct.wrap(v) for k, v in CNV.JSON2object(alias_json).items()}
+        self.aliases = {k: wrap(v) for k, v in convert.json2value(alias_json).items()}
 
         Log.note("{{num}} aliases loaded", {"num": len(self.aliases.keys())})
 
bzETL/replicate.py

@@ -1,6 +1,5 @@
 # encoding: utf-8
-#
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.
@@ -8,29 +7,32 @@
 # Author: Kyle Lahnakoski (kyle@lahnakoski.com)
 #
 
 
-from datetime import datetime, timedelta
-from pyLibrary.collections import MIN
-from pyLibrary.struct import nvl, Struct
-from pyLibrary.thread.threads import ThreadedQueue
-from pyLibrary.times.timer import Timer
-import transform_bugzilla
-from pyLibrary.cnv import CNV
-from pyLibrary.env.logs import Log
-from pyLibrary.queries import Q
-from pyLibrary.env import startup
-from pyLibrary.env.files import File
-from pyLibrary.collections.multiset import Multiset
-from pyLibrary.env.elasticsearch import ElasticSearch
 
 
 #
 # REPLICATION
 #
 # Replication has a few benefits:
 # 1) The slave can have scripting enabled, allowing more powerful set of queries
-# 2) Physical proximity increases the probability of reduced latency
+# 2) Physical proximity reduces latency
 # 3) The slave can be configured with better hardware
-# 4) The slave's exclusivity increases availability (Mozilla's public cluster my have time of high load)
+# 4) The slave's exclusivity increases availability (Mozilla's public cluster may have high load)
 
+from __future__ import unicode_literals
+from __future__ import division
+from __future__ import absolute_import
+
+from datetime import datetime, timedelta
+from bzETL import transform_bugzilla
+from pyLibrary import convert
+from pyLibrary.collections import MIN, Multiset
+from pyLibrary.debugs import startup
+from pyLibrary.debugs.logs import Log
+from pyLibrary.dot import coalesce, Dict
+from pyLibrary.env import elasticsearch
+from pyLibrary.env.elasticsearch import Cluster
+from pyLibrary.env.files import File
+from pyLibrary.queries import qb
+from pyLibrary.thread.threads import ThreadedQueue
+from pyLibrary.times.timer import Timer
 
 
 far_back = datetime.utcnow() - timedelta(weeks=52)
@@ -39,12 +41,12 @@ BATCH_SIZE = 1000
 
 def extract_from_file(source_settings, destination):
     file = File(source_settings.filename)
-    for g, d in Q.groupby(file, size=BATCH_SIZE):
+    for g, d in qb.groupby(file, size=BATCH_SIZE):
         try:
             d2 = map(
                 lambda (x): {"id": x.id, "value": x},
                 map(
-                    lambda(x): transform_bugzilla.normalize(CNV.JSON2object(x)),
+                    lambda(x): transform_bugzilla.normalize(convert.json2value(x)),
                     d
                 )
             )
@@ -61,8 +63,8 @@ def extract_from_file(source_settings, destination):
 
 def get_last_updated(es):
 
-    if not isinstance(es, ElasticSearch):
-        return CNV.milli2datetime(0)
+    if not isinstance(es, elasticsearch.Index):
+        return convert.milli2datetime(0)
 
     try:
         results = es.search({
@@ -70,7 +72,7 @@ def get_last_updated(es):
             "query": {"match_all": {}},
             "filter": {
                 "range": {
-                    "modified_ts": {"gte": CNV.datetime2milli(far_back)}}}
+                    "modified_ts": {"gte": convert.datetime2milli(far_back)}}}
             }},
             "from": 0,
             "size": 0,
@@ -79,8 +81,8 @@ def get_last_updated(es):
         })
 
         if results.facets.modified_ts.count == 0:
-            return CNV.milli2datetime(0)
-        return CNV.milli2datetime(results.facets.modified_ts.max)
+            return convert.milli2datetime(0)
+        return convert.milli2datetime(results.facets.modified_ts.max)
     except Exception, e:
         Log.error("Can not get_last_updated from {{host}}/{{index}}",{
             "host": es.settings.host,
@@ -102,13 +104,13 @@ def get_pending(es, since):
 
     pending_bugs = None
 
-    for s, e in Q.intervals(0, max_bug+1, 100000):
+    for s, e in qb.intervals(0, max_bug+1, 100000):
         Log.note("Collect history for bugs from {{start}}..{{end}}", {"start":s, "end":e})
         result = es.search({
             "query": {"filtered": {
                 "query": {"match_all": {}},
                 "filter": {"and":[
-                    {"range": {"modified_ts": {"gte": CNV.datetime2milli(since)}}},
+                    {"range": {"modified_ts": {"gte": convert.datetime2milli(since)}}},
                     {"range": {"bug_id": {"gte": s, "lte": e}}}
                 ]}
             }},
@@ -140,30 +142,30 @@ def get_pending(es, since):
 # USE THE source TO GET THE INDEX SCHEMA
 def get_or_create_index(destination_settings, source):
     #CHECK IF INDEX, OR ALIAS, EXISTS
-    es = ElasticSearch(destination_settings)
+    es = elasticsearch.Index(destination_settings)
     aliases = es.get_aliases()
 
     indexes = [a for a in aliases if a.alias == destination_settings.index or a.index == destination_settings.index]
     if not indexes:
         #CREATE INDEX
-        schema = CNV.JSON2object(File(destination_settings.schema_file).read(), paths=True)
+        schema = convert.json2value(File(destination_settings.schema_file).read(), paths=True)
         assert schema.settings
         assert schema.mappings
-        ElasticSearch.create_index(destination_settings, schema, limit_replicas=True)
+        Cluster(destination_settings).create_index(destination_settings, schema, limit_replicas=True)
     elif len(indexes) > 1:
         Log.error("do not know how to replicate to more than one index")
     elif indexes[0].alias != None:
         destination_settings.alias = indexes[0].alias
         destination_settings.index = indexes[0].index
 
-    return ElasticSearch(destination_settings)
+    return elasticsearch.Index(destination_settings)
 
 
 def replicate(source, destination, pending, last_updated):
     """
     COPY source RECORDS TO destination
     """
-    for g, bugs in Q.groupby(pending, max_size=BATCH_SIZE):
+    for g, bugs in qb.groupby(pending, max_size=BATCH_SIZE):
         with Timer("Replicate {{num_bugs}} bug versions", {"num_bugs": len(bugs)}):
             data = source.search({
                 "query": {"filtered": {
@@ -171,7 +173,7 @@ def replicate(source, destination, pending, last_updated):
                     "filter": {"and": [
                         {"terms": {"bug_id": set(bugs)}},
                         {"range": {"expires_on":
-                            {"gte": CNV.datetime2milli(last_updated)}
+                            {"gte": convert.datetime2milli(last_updated)}
                         }}
                     ]}
                 }},
@@ -197,12 +199,12 @@ def main(settings):
     #USE A SOURCE FILE
     if settings.source.filename != None:
         settings.destination.alias = settings.destination.index
-        settings.destination.index = ElasticSearch.proto_name(settings.destination.alias)
-        schema = CNV.JSON2object(File(settings.destination.schema_file).read(), paths=True, flexible=True)
+        settings.destination.index = Cluster.proto_name(settings.destination.alias)
+        schema = convert.json2value(File(settings.destination.schema_file).read(), paths=True, flexible=True)
         if transform_bugzilla.USE_ATTACHMENTS_DOT:
-            schema = CNV.JSON2object(CNV.object2JSON(schema).replace("attachments_", "attachments."))
+            schema = convert.json2value(convert.value2json(schema).replace("attachments_", "attachments."))
 
-        dest = ElasticSearch.create_index(settings.destination, schema, limit_replicas=True)
+        dest = Cluster(settings.destination).create_index(settings.destination, schema, limit_replicas=True)
         dest.set_refresh_interval(-1)
         extract_from_file(settings.source, dest)
         dest.set_refresh_interval(1)
@@ -212,15 +214,15 @@ def main(settings):
 
     else:
         # SYNCH WITH source ES INDEX
-        source=ElasticSearch(settings.source)
+        source=elasticsearch.Index(settings.source)
 
 
     # USE A DESTINATION FILE
     if settings.destination.filename:
         Log.note("Sending records to file: {{filename}}", {"filename":settings.destination.filename})
         file = File(settings.destination.filename)
-        destination = Struct(
-            extend=lambda x: file.extend([CNV.object2JSON(v["value"]) for v in x]),
+        destination = Dict(
+            extend=lambda x: file.extend([convert.value2json(v["value"]) for v in x]),
             file=file
         )
     else:
@@ -229,17 +231,17 @@ def main(settings):
     # GET LAST UPDATED
     from_file = None
     if time_file.exists:
-        from_file = CNV.milli2datetime(CNV.value2int(time_file.read()))
+        from_file = convert.milli2datetime(convert.value2int(time_file.read()))
     from_es = get_last_updated(destination) - timedelta(hours=1)
-    last_updated = MIN(nvl(from_file, CNV.milli2datetime(0)), from_es)
+    last_updated = MIN(coalesce(from_file, convert.milli2datetime(0)), from_es)
     Log.note("updating records with modified_ts>={{last_updated}}", {"last_updated":last_updated})
 
     pending = get_pending(source, last_updated)
-    with ThreadedQueue(destination, size=1000) as data_sink:
+    with ThreadedQueue(destination, max_size=1000) as data_sink:
         replicate(source, data_sink, pending, last_updated)
 
     # RECORD LAST UPDATED
-    time_file.write(unicode(CNV.datetime2milli(current_time)))
+    time_file.write(unicode(convert.datetime2milli(current_time)))
 
 
 def start():
bzETL/transform_bugzilla.py

@@ -1,12 +1,23 @@
 # encoding: utf-8
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 # Author: Kyle Lahnakoski (kyle@lahnakoski.com)
 #
 
+from __future__ import unicode_literals
+from __future__ import division
+from __future__ import absolute_import
+
 from datetime import date
 import re
-from pyLibrary.cnv import CNV
-from pyLibrary.env import elasticsearch
-
-from pyLibrary.env.logs import Log
-from pyLibrary.queries import Q
+from pyLibrary import convert
+from pyLibrary.debugs.logs import Log
+from pyLibrary.env import elasticsearch
+from pyLibrary.queries import qb
 
 
 USE_ATTACHMENTS_DOT = True
@@ -34,7 +45,7 @@ DATE_PATTERN_RELAXED = re.compile("^[0-9]{4}[\\/-][0-9]{2}[\\/-][0-9]{2}")
 def rename_attachments(bug_version):
     if bug_version.attachments == None: return bug_version
     if not USE_ATTACHMENTS_DOT:
-        bug_version.attachments=CNV.JSON2object(CNV.object2JSON(bug_version.attachments).replace("attachments.", "attachments_"))
+        bug_version.attachments=convert.json2value(convert.value2json(bug_version.attachments).replace("attachments.", "attachments_"))
     return bug_version
 
 
|
@ -47,31 +58,31 @@ def normalize(bug, old_school=False):
|
|||
|
||||
#ENSURE STRUCTURES ARE SORTED
|
||||
# Do some processing to make sure that diffing between runs stays as similar as possible.
|
||||
bug.flags=Q.sort(bug.flags, "value")
|
||||
bug.flags=qb.sort(bug.flags, "value")
|
||||
|
||||
if bug.attachments:
|
||||
if USE_ATTACHMENTS_DOT:
|
||||
bug.attachments=CNV.JSON2object(CNV.object2JSON(bug.attachments).replace("attachments_", "attachments."))
|
||||
bug.attachments = Q.sort(bug.attachments, "attach_id")
|
||||
bug.attachments=convert.json2value(convert.value2json(bug.attachments).replace("attachments_", "attachments."))
|
||||
bug.attachments = qb.sort(bug.attachments, "attach_id")
|
||||
for a in bug.attachments:
|
||||
for k,v in list(a.items()):
|
||||
if k.startswith("attachments") and (k.endswith("isobsolete") or k.endswith("ispatch") or k.endswith("isprivate")):
|
||||
new_v=CNV.value2int(v)
|
||||
new_v=convert.value2int(v)
|
||||
new_k=k[12:]
|
||||
a[k.replace(".", "\.")]=new_v
|
||||
if not old_school:
|
||||
a[new_k]=new_v
|
||||
a.flags = Q.sort(a.flags, ["modified_ts", "value"])
|
||||
a.flags = qb.sort(a.flags, ["modified_ts", "value"])
|
||||
|
||||
if bug.changes != None:
|
||||
if USE_ATTACHMENTS_DOT:
|
||||
json = CNV.object2JSON(bug.changes).replace("attachments_", "attachments.")
|
||||
bug.changes=CNV.JSON2object(json)
|
||||
bug.changes = Q.sort(bug.changes, ["attach_id", "field_name"])
|
||||
json = convert.value2json(bug.changes).replace("attachments_", "attachments.")
|
||||
bug.changes=convert.json2value(json)
|
||||
bug.changes = qb.sort(bug.changes, ["attach_id", "field_name"])
|
||||
|
||||
#bug IS CONVERTED TO A 'CLEAN' COPY
|
||||
bug = elasticsearch.scrub(bug)
|
||||
# bug.attachments = nvl(bug.attachments, []) # ATTACHMENTS MUST EXIST
|
||||
# bug.attachments = coalesce(bug.attachments, []) # ATTACHMENTS MUST EXIST
|
||||
|
||||
|
||||
for f in NUMERIC_FIELDS:
|
||||
|
@@ -79,11 +90,11 @@ def normalize(bug, old_school=False):
         if v == None:
             continue
         elif f in MULTI_FIELDS:
-            bug[f] = CNV.value2intlist(v)
-        elif CNV.value2number(v) == 0:
+            bug[f] = convert.value2intlist(v)
+        elif convert.value2number(v) == 0:
             del bug[f]
         else:
-            bug[f]=CNV.value2number(v)
+            bug[f]=convert.value2number(v)
 
     # Also reformat some date fields
     for dateField in ["deadline", "cf_due_date", "cf_last_resolved"]:
@@ -91,7 +102,7 @@ def normalize(bug, old_school=False):
         if v == None: continue
         try:
             if isinstance(v, date):
-                bug[dateField] = CNV.datetime2milli(v)
+                bug[dateField] = convert.datetime2milli(v)
             elif isinstance(v, (long, int, float)) and len(unicode(v)) in [12, 13]:
                 bug[dateField] = v
             elif not isinstance(v, basestring):
@@ -100,17 +111,17 @@ def normalize(bug, old_school=False):
                 # Convert to "2012/01/01 00:00:00.000"
                 # Example: bug 856732 (cf_last_resolved)
                 # dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z"
-                bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v+"000", "%Y/%m/%d %H:%M%:S%f"))
+                bug[dateField] = convert.datetime2milli(convert.string2datetime(v+"000", "%Y/%m/%d %H:%M%:S%f"))
             elif DATE_PATTERN_STRICT_SHORT.match(v):
                 # Convert "2012/01/01 00:00:00" to "2012-01-01T00:00:00.000Z", then to a timestamp.
                 # Example: bug 856732 (cf_last_resolved)
                 # dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z"
-                bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v.replace("-", "/"), "%Y/%m/%d %H:%M:%S"))
+                bug[dateField] = convert.datetime2milli(convert.string2datetime(v.replace("-", "/"), "%Y/%m/%d %H:%M:%S"))
             elif DATE_PATTERN_RELAXED.match(v):
                 # Convert "2012/01/01 00:00:00.000" to "2012-01-01"
                 # Example: bug 643420 (deadline)
                 #          bug 726635 (cf_due_date)
-                bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v[0:10], "%Y-%m-%d"))
+                bug[dateField] = convert.datetime2milli(convert.string2datetime(v[0:10], "%Y-%m-%d"))
         except Exception, e:
             Log.error("problem with converting date to milli (type={{type}}, value={{value}})", {"value":bug[dateField], "type":type(bug[dateField]).name}, e)
 
pyLibrary README (Module `meta`)

@@ -18,7 +18,7 @@ Module `meta`
 
 **Description**
 
-`@use_settings` will decorate a function to accept a `settings` parameter which is just like `**kwargs`, but the named parameters can override the properties in `settings`, rather than raise duplicate keyname exceptions.
+`@use_settings` will decorate a function to accept a `settings` parameter which is just like `**kwargs`, but the other parameters can override the properties in `settings`, rather than raise duplicate keyname exceptions.
 
 
 **Example**
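
A minimal sketch of the decorator behavior described above (illustrative only, not pyLibrary's implementation): explicit keyword arguments win, and anything missing is pulled from `settings`.

    import inspect
    from functools import wraps

    def use_settings(func):
        arg_names = inspect.getargspec(func).args

        @wraps(func)
        def wrapper(*args, **kwargs):
            settings = kwargs.pop("settings", None) or {}
            merged = {k: v for k, v in settings.items() if k in arg_names}
            merged.update(kwargs)  # explicit parameters override settings
            return func(*args, **merged)
        return wrapper

    @use_settings
    def connect(host, port=9200):
        return host, port

    connect(settings={"host": "localhost", "port": 9300}, port=9200)  # ("localhost", 9200)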
pyLibrary/convert.py

@@ -10,7 +10,6 @@
 from __future__ import unicode_literals
 from __future__ import division
 from __future__ import absolute_import
-from __future__ import absolute_import
 
 import HTMLParser
 import StringIO
@@ -23,11 +22,12 @@ import gzip
 import hashlib
 from io import BytesIO
 import json
+from numbers import Number
 import re
 from tempfile import TemporaryFile
 
 from pyLibrary import strings, meta
-from pyLibrary.dot import wrap, wrap_dot, unwrap
+from pyLibrary.dot import wrap, wrap_dot, unwrap, unwraplist
 from pyLibrary.collections.multiset import Multiset
 from pyLibrary.debugs.logs import Log, Except
 from pyLibrary.env.big_data import FileString, safe_size
@ -103,6 +103,7 @@ def json2value(json_string, params={}, flexible=False, paths=False):
|
|||
if params:
|
||||
json_string = expand_template(json_string, params)
|
||||
|
||||
|
||||
# LOOKUP REFERENCES
|
||||
value = wrap(json_decoder(json_string))
|
||||
|
||||
|
@ -113,10 +114,10 @@ def json2value(json_string, params={}, flexible=False, paths=False):
|
|||
|
||||
except Exception, e:
|
||||
e = Except.wrap(e)
|
||||
if ("Expecting '" in e and "' delimiter: line" in e) or "Expecting property name enclosed in double quotes: " in e:
|
||||
if "Expecting '" in e and "' delimiter: line" in e:
|
||||
line_index = int(strings.between(e.message, " line ", " column ")) - 1
|
||||
column = int(strings.between(e.message, " column ", " ")) - 1
|
||||
line = json_string.split("\n")[line_index].replace("\t", " ")
|
||||
line = json_string.split("\n")[line_index]
|
||||
if column > 20:
|
||||
sample = "..." + line[column - 20:]
|
||||
pointer = " " + (" " * 20) + "^"
|
||||
|
@ -243,22 +244,52 @@ def list2tab(rows):
|
|||
return "\t".join(keys) + "\n" + "\n".join(output)
|
||||
|
||||
|
||||
def list2table(rows):
|
||||
def list2table(rows, column_names=None):
|
||||
if column_names:
|
||||
keys = list(set(column_names))
|
||||
else:
|
||||
columns = set()
|
||||
for r in rows:
|
||||
columns |= set(r.keys())
|
||||
keys = list(columns)
|
||||
|
||||
output = []
|
||||
for r in rows:
|
||||
output.append([r[k] for k in keys])
|
||||
output = [[unwraplist(r[k]) for k in keys] for r in rows]
|
||||
|
||||
return wrap({
|
||||
"meta": {"format": "table"},
|
||||
"header": keys,
|
||||
"data": output
|
||||
})
|
||||
|
||||
|
||||
def list2cube(rows, column_names=None):
|
||||
if column_names:
|
||||
keys = column_names
|
||||
else:
|
||||
columns = set()
|
||||
for r in rows:
|
||||
columns |= set(r.keys())
|
||||
keys = list(columns)
|
||||
|
||||
data = {k: [] for k in keys}
|
||||
output = wrap({
|
||||
"meta": {"format": "cube"},
|
||||
"edges": [
|
||||
{
|
||||
"name": "rownum",
|
||||
"domain": {"type": "rownum", "min": 0, "max": len(rows), "interval": 1}
|
||||
}
|
||||
],
|
||||
"data": data
|
||||
})
|
||||
|
||||
for r in rows:
|
||||
for k in keys:
|
||||
data[k].append(r[k])
|
||||
|
||||
return output
|
||||
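A quick sketch of the two output shapes (the sample rows are invented; note `list2table` passes `column_names` through `set()`, so the header order is not guaranteed):

    rows = [
        {"bug_id": 1, "status": "open"},
        {"bug_id": 2, "status": "closed"},
    ]

    list2table(rows, column_names=["bug_id", "status"])
    # {"meta": {"format": "table"}, "header": ["bug_id", "status"],
    #  "data": [[1, "open"], [2, "closed"]]}     (header order may vary)

    list2cube(rows, column_names=["bug_id", "status"])
    # {"meta": {"format": "cube"},
    #  "edges": [{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": 2, "interval": 1}}],
    #  "data": {"bug_id": [1, 2], "status": ["open", "closed"]}}
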

def value2string(value):
    # PROPER NULL HANDLING
    if value == None:

@@ -443,11 +474,15 @@ def bytes2sha1(value):
def value2intlist(value):
    if value == None:
        return None
+   elif isinstance(value, Number):
+       return [int(value)]
+   elif isinstance(value, basestring):
+       if value.strip() == "":
+           return None
+       return [int(value)]
    elif hasattr(value, '__iter__'):
        output = [int(d) for d in value if d != "" and d != None]
        return output
-   elif value.strip() == "":
-       return None
    else:
        return [int(value)]

@@ -547,7 +582,7 @@ def ini2value(ini_content):

    buff = StringIO.StringIO(ini_content)
    config = ConfigParser()
-   config.read(buff, "dummy")
+   config._read(buff, "dummy")

    output = {}
    for section in config.sections():

@@ -28,6 +28,7 @@ class Log_usingElasticSearch(BaseLog):
        self.es = Cluster(settings).get_or_create_index(
            schema=convert.json2value(convert.value2json(SCHEMA), paths=True),
            limit_replicas=True,
+           tjson=True,
            settings=settings
        )
        self.queue = self.es.threaded_queue(max_size=max_size, batch_size=batch_size)

@@ -60,7 +61,7 @@ class Log_usingElasticSearch(BaseLog):

SCHEMA = {
    "settings": {
-       "index.number_of_shards": 3,
+       "index.number_of_shards": 1,
        "index.number_of_replicas": 2,
        "index.store.throttle.type": "merge",
        "index.store.throttle.max_bytes_per_sec": "2mb",

@@ -73,13 +74,38 @@ SCHEMA = {
        {
            "values_strings": {
                "match": "*",
-               "match_mapping_type" : "string",
+               "match_mapping_type": "string",
                "mapping": {
                    "type": "string",
-                   "index": "not_analyzed"
+                   "index": "not_analyzed",
+                   "doc_values": True
                }
            }
        },
+       {
+           "default_doubles": {
+               "mapping": {
+                   "index": "not_analyzed",
+                   "type": "double",
+                   "doc_values": True
+               },
+               "match_mapping_type": "double",
+               "match": "*"
+           }
+       },
+       {
+           "default_longs": {
+               "mapping": {
+                   "index": "not_analyzed",
+                   "type": "long",
+                   "doc_values": True
+               },
+               "match_mapping_type": "long|integer",
+               "match_pattern": "regex",
+               "path_match": ".*"
+           }
+       }
    ],
    "_all": {
        "enabled": False

@@ -90,11 +116,17 @@ SCHEMA = {
    },
    "properties": {
        "timestamp": {
            "type": "object",
            "properties": {
                "$value": {
                    "type": "double",
                    "index": "not_analyzed",
-                   "store": "yes"
+                   "store": "yes",
+                   "doc_values": True
                }
            }
        },
-       "params": {
+       "params": {  # JUST IN CASE WE ARE NOT USING TYPED JSON
            "type": "object",
            "enabled": False,
            "index": "no",

@@ -270,6 +270,7 @@ class Log(object):
            default_params = {}

        params = dict(unwrap(default_params), **more_params)

        add_to_trace = False
        if cause == None:
            cause = []

@@ -60,7 +60,7 @@ def read_settings(filename=None, defs=None):
        Log.error("Can not file settings file {{filename}}", {
            "filename": settings_file.abspath
        })
-   settings = ref.get("file://" + settings_file.abspath)
+   settings = ref.get("file:///" + settings_file.abspath.replace(os.sep, "/"))
    if defs:
        settings.args = _argparse(defs)
    return settings

@@ -68,7 +68,7 @@ different names and slightly different variations, some examples are:
* `jinja2.environment.Environment.getattr()` to allow convenient dot notation
* `argparse.Environment()` - code performs `setattr(e, name, value)` on
  instances of Environment to provide dot(`.`) accessors
-* `collections.namedtuple()` - gives attribute names to tuple indicies
+* `collections.namedtuple()` - gives attribute names to tuple indices
  effectively providing <code>a.b</code> rather than <code>a["b"]</code>
  offered by dicts
* [configman's DotDict](https://github.com/mozilla/configman/blob/master/configman/dotdict.py)

@@ -131,7 +131,9 @@ replaced with `None` in all cases.

###Identity and Absorbing (Zero) Elements###

-With closure we can realize we have defined an algebraic semigroup: The identity element is the dot string (`"."`) and the zero element is `Null` (or `None`).
+With closure we can realize we have defined an algebraic semigroup: The
+identity element is the dot string (`"."`) and the zero element is `Null`
+(or `None`).

1. `a[Null] == Null`
2. `a["."] == a`

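A minimal sketch of those two laws, using `wrap` and `Null` from `pyLibrary.dot` (the sample data is invented):

    from pyLibrary.dot import wrap, Null

    a = wrap({"b": {"c": 42}})

    assert a["b.c"] == 42      # dot-string path access
    assert a["."] is a         # "." is the identity element
    assert a[Null] == Null     # Null absorbs: a[Null] == Null
    assert a["x.y.z"] == Null  # missing paths collapse to Null, not KeyError
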
@@ -208,7 +210,7 @@ all `a<=b`
* Trinary slicing `[::]` uses the flat list definition

When assuming a *flat-list*, we loose the *take-from-the-right* tricks gained
-from modulo arithmetic on the indicies. Therefore, we require extra methods
+from modulo arithmetic on the indices. Therefore, we require extra methods
to perform right-based slicing:

* **right()** - `flat_list.right(b)` same as `loop_list[-b:]` except when `b<=0`

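The `b <= 0` corner case is why a plain negative slice is not enough: in Python `-0 == 0`, so `v[-0:]` silently returns the whole list instead of the empty right-slice. A small sketch (assuming `DictList.right()` behaves as the README specifies):

    v = [1, 2, 3, 4]
    v[-2:]   # [3, 4]          - slicing from the right works for b > 0
    v[-0:]   # [1, 2, 3, 4]    - surprise: -0 == 0, so we get everything

    from pyLibrary.dot.lists import DictList
    d = DictList(v)
    d.right(2)  # [3, 4]
    d.right(0)  # []           - the corner case handled explicitly
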
@@ -231,9 +233,17 @@ The dot operator on a `DictList` performs a simple projection; it will return a

DictObject for data
-------------------

-There are two major families of objects in Object Oriented programming. The first, are ***Actors***: characterized by a number of useful instance methods and some state bundled into a package. The second are ***Data***: Primarily a set of properties, with only (de)serialization functions, or algebraic operators defined. Boto has many examples of these *Data* classes, [here is one](https://github.com/boto/boto/blob/4b8269562e663f090403e57ba1a3a471b6e0aa0e/boto/ec2/networkinterface.py).
+There are two major families of objects in Object Oriented programming. The
+first, are ***Actors***: characterized by a number of useful instance methods
+and some state bundled into a package. The second are ***Data***: Primarily
+a set of properties, with only (de)serialization functions, or algebraic
+operators defined. Boto has many examples of these *Data* classes,
+[here is one](https://github.com/boto/boto/blob/4b8269562e663f090403e57ba1a3a471b6e0aa0e/boto/ec2/networkinterface.py).

-The problem with *Data* objects is they have an useless distinction between attributes and properties. This prevents us from using the `[]` operator for dereferencing, forcing use to use the verbose `__getattr__()` instead. It also prevents the use of query operators over these objects.
+The problem with *Data* objects is they have an useless distinction between
+attributes and properties. This prevents us from using the `[]` operator for
+dereferencing, forcing use to use the verbose `getattr()` instead. It
+also prevents the use of query operators over these objects.

You can register a class as a *data* class, by wrapping it with `DictClass`.

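A sketch of the idea (the `Interface` class is invented; `DictObject` is the wrapper shown in the objects.py diff below, and the assumption here is that it supports both accessors, which is the point of the wrapper):

    from pyLibrary.dot.objects import DictObject

    class Interface(object):          # a typical *data* class: properties only
        def __init__(self):
            self.subnet_id = "subnet-1234"
            self.status = "available"

    wrapped = DictObject(Interface())
    wrapped["subnet_id"]   # "subnet-1234" - item access now works
    wrapped.status         # "available"   - attribute access still works
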
@@ -64,7 +64,9 @@ def split_field(field):
    """
    RETURN field AS ARRAY OF DOT-SEPARATED FIELDS
    """
-   if field.find(".") >= 0:
+   if field == "." or field==None:
+       return []
+   elif field.find(".") >= 0:
        field = field.replace("\.", "\a")
        return [k.replace("\a", ".") for k in field.split(".")]
    else:

@@ -75,7 +77,10 @@ def join_field(field):
    """
    RETURN field SEQUENCE AS STRING
    """
-   return ".".join([f.replace(".", "\.") for f in field])
+   potent = [f for f in field if f != "."]
+   if not potent:
+       return "."
+   return ".".join([f.replace(".", "\.") for f in potent])


def hash_value(v):

@@ -128,7 +133,13 @@ def _all_default(d, default, seen=None):

    if existing_value == None:
        if default_value != None:
-           _set_attr(d, [k], default_value)
+           try:
+               _set_attr(d, [k], default_value)
+           except Exception, e:
+               if PATH_NOT_FOUND not in e:
+                   from pyLibrary.debugs.logs import Log
+                   Log.error("Can not set attribute {{name}}", name=k, cause=e)
    elif (hasattr(existing_value, "__setattr__") or isinstance(existing_value, Mapping)) and isinstance(default_value, Mapping):
        df = seen.get(id(existing_value))
        if df:

@@ -143,13 +154,13 @@ def _getdefault(obj, key):
    TRY BOTH ATTRIBUTE AND ITEM ACCESS, OR RETURN Null
    """
    try:
        return getattr(obj, key)
-   except Exception, e:
-       return obj[key]
+   except Exception, f:
+       pass

    try:
        return obj[key]
-   except Exception, f:
-       return getattr(obj, key)
+   except Exception, e:
+       pass

    try:

@@ -242,6 +253,8 @@ def _get_attr(obj, path):
        obj = getattr(obj, attr_name)
        return _get_attr(obj, path[1:])
    except Exception, e:
        pass

    try:
        obj = obj[attr_name]
        return _get_attr(obj, path[1:])

@@ -270,7 +283,7 @@ def _set_attr(obj, path, value):
        new_value = value

    try:
-       _get(obj, "__setattr__")(attr_name, new_value)
+       setattr(obj, attr_name, new_value)
        return old_value
    except Exception, e:
        try:

@@ -70,13 +70,19 @@ class Dict(MutableMapping):
    def __getitem__(self, key):
        if key == None:
            return Null
+       if key == ".":
+           output = _get(self, "_dict")
+           if isinstance(output, Mapping):
+               return self
+           else:
+               return output

        if isinstance(key, str):
            key = key.decode("utf8")
        elif not isinstance(key, unicode):
            from pyLibrary.debugs.logs import Log
            Log.error("only string keys are supported")

        d = _get(self, "_dict")

        if key.find(".") >= 0:

@@ -96,6 +102,13 @@ class Dict(MutableMapping):
            from pyLibrary.debugs.logs import Log
            Log.error("key is empty string. Probably a bad idea")
+       if key == ".":
+           # SOMETHING TERRIBLE HAPPENS WHEN value IS NOT A Mapping;
+           # HOPEFULLY THE ONLY OTHER METHOD RUN ON self IS unwrap()
+           v = unwrap(value)
+           _set(self, "_dict", v)
+           return v
+
        if isinstance(key, str):
            key = key.decode("utf8")

@@ -257,13 +270,13 @@ class Dict(MutableMapping):
        try:
            return "Dict("+dict.__str__(_get(self, "_dict"))+")"
        except Exception, e:
-           return "{}"
+           return "Dict{}"

    def __repr__(self):
        try:
            return "Dict("+dict.__repr__(_get(self, "_dict"))+")"
        except Exception, e:
-           return "Dict{}"
+           return "Dict()"


class _DictUsingSelf(dict):

@@ -460,7 +473,6 @@ class _DictUsingSelf(dict):
        return "Dict()"


# KEEP TRACK OF WHAT ATTRIBUTES ARE REQUESTED, MAYBE SOME (BUILTIN) ARE STILL USEFUL
requested = set()

@@ -18,14 +18,14 @@ from pyLibrary.dot import wrap, unwrap

_get = object.__getattribute__
_set = object.__setattr__
-dictwrap = None
+_dictwrap = None


def _late_import():
-   global dictwrap
-   from pyLibrary.dot.objects import dictwrap
+   global _dictwrap
+   from pyLibrary.dot.objects import dictwrap as _dictwrap

-   _ = dictwrap
+   _ = _dictwrap


class DictList(list):
    """

@@ -82,10 +82,10 @@ class DictList(list):
        """
        simple `select`
        """
-       if not dictwrap:
+       if not _dictwrap:
            _late_import()

-       return DictList(vals=[unwrap(dictwrap(v)[key]) for v in _get(self, "list")])
+       return DictList(vals=[unwrap(_dictwrap(v)[key]) for v in _get(self, "list")])

    def filter(self, _filter):
        return DictList(vals=[unwrap(u) for u in (wrap(v) for v in _get(self, "list")) if _filter(u)])

@@ -112,6 +112,9 @@ class DictList(list):
        Log.warning("slicing is broken in Python 2.7: a[i:j] == a[i+len(a), j] sometimes. Use [start:stop:step] (see https://github.com/klahnakoski/pyLibrary/blob/master/pyLibrary/dot/README.md#the-slice-operator-in-python27-is-inconsistent)")
        return self[i:j:]

+   def __list__(self):
+       return self.list
+
    def copy(self):
        return DictList(list(_get(self, "list")))

@@ -22,6 +22,9 @@ WRAPPED_CLASSES = set()


class DictObject(Mapping):
+   """
+   TREAT AN OBJECT LIKE DATA
+   """

    def __init__(self, obj):
        _set(self, "_obj", obj)

@@ -90,12 +93,16 @@ def dictwrap(v):
        m = Dict()
        _set(m, "_dict", v)  # INJECT m.__dict__=v SO THERE IS NO COPY
        return m
    elif type_ is Dict:
        return v
    elif type_ is NoneType:
-       return None  # So we allow `is None`
+       return None  # So we allow `is None` (OFTEN USED IN PYTHON LIBRARIES)
    elif type_ is list:
        return DictList(v)
    elif type_ is GeneratorType:
        return (wrap(vv) for vv in v)
+   elif hasattr(v, "as_dict"):
+       return v.as_dict()
    elif isinstance(v, (basestring, int, float, Decimal, datetime, date, Dict, DictList, NullType, NoneType)):
        return v
    else:

@@ -18,19 +18,28 @@ import time

from pyLibrary import convert
from pyLibrary.debugs.logs import Log
+from pyLibrary.dot import coalesce, Null, Dict, set_default, join_field, split_field
+from pyLibrary.dot.lists import DictList
+from pyLibrary.dot import wrap
from pyLibrary.env import http
+from pyLibrary.jsons.typed_encoder import json2typed
from pyLibrary.maths.randoms import Random
from pyLibrary.maths import Math
from pyLibrary.meta import use_settings
+from pyLibrary.queries import qb
from pyLibrary.strings import utf82unicode
-from pyLibrary.dot import coalesce, Null, Dict, set_default
-from pyLibrary.dot.lists import DictList
-from pyLibrary.dot import wrap, unwrap
-from pyLibrary.thread.threads import ThreadedQueue, Thread
+from pyLibrary.thread.threads import ThreadedQueue, Thread, Lock


-class Index(object):
ES_NUMERIC_TYPES = ["long", "integer", "double", "float"]
ES_PRIMITIVE_TYPES = ["string", "boolean", "integer", "date", "long", "double"]


+class Features(object):
+   pass
+
+
+class Index(Features):
    """
    AN ElasticSearch INDEX LIFETIME MANAGEMENT TOOL

@@ -53,18 +62,17 @@ class Index(object):
        alias=None,
        explore_metadata=True,  # PROBING THE CLUSTER FOR METADATA IS ALLOWED
        read_only=True,
+       tjson=False,  # STORED AS TYPED JSON
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        debug=False,  # DO NOT SHOW THE DEBUG STATEMENTS
        settings=None
    ):
-       if index==None:
-           Log.error("not allowed")
        if index == alias:
            Log.error("must have a unique index name")

        self.cluster_state = None
        self.cluster_metadata = None
        self.debug = debug
        if self.debug:
            Log.alert("elasticsearch debugging for index {{index}} is on", index=settings.index)

@@ -73,15 +81,18 @@ class Index(object):
        self.cluster = Cluster(settings)

        try:
-           index = self.get_index(index)
-           if index and alias==None:
+           full_index = self.get_index(index)
+           if full_index and alias==None:
                settings.alias = settings.index
-               settings.index = index
-           if index == None:
+               settings.index = full_index
+           if full_index==None:
                Log.error("not allowed")
            if type == None:
                # NO type PROVIDED, MAYBE THERE IS A SUITABLE DEFAULT?
-               indices = self.cluster.get_metadata().indices
+               with self.cluster.metadata_locker:
+                   index_ = self.cluster._metadata.indices[self.settings.index]
+               if not index_:
+                   indices = self.cluster.get_metadata(index=self.settings.index).indices
+                   index_ = indices[self.settings.index]

                candidate_types = list(index_.mappings.keys())

@@ -90,13 +101,16 @@ class Index(object):
                self.settings.type = type = candidate_types[0]
        except Exception, e:
            # EXPLORING (get_metadata()) IS NOT ALLOWED ON THE PUBLIC CLUSTER
-           pass
+           Log.error("not expected", cause=e)

-       self.path = "/" + index + "/" + type
+       if not type:
+           Log.error("not allowed")
+
+       self.path = "/" + full_index + "/" + type

    @property
    def url(self):
-       return self.cluster.path + "/" + self.path
+       return self.cluster.path.rstrip("/") + "/" + self.path.lstrip("/")

    def get_schema(self, retry=True):
        if self.settings.explore_metadata:

@@ -134,25 +148,25 @@ class Index(object):
    def add_alias(self, alias=None):
        if alias:
            self.cluster_state = None
-           self.cluster._post(
+           self.cluster.post(
                "/_aliases",
-               data=convert.unicode2utf8(convert.value2json({
+               data={
                    "actions": [
                        {"add": {"index": self.settings.index, "alias": alias}}
                    ]
-               })),
+               },
                timeout=coalesce(self.settings.timeout, 30)
            )
        else:
            # SET ALIAS ACCORDING TO LIFECYCLE RULES
            self.cluster_state = None
-           self.cluster._post(
+           self.cluster.post(
                "/_aliases",
-               data=convert.unicode2utf8(convert.value2json({
+               data={
                    "actions": [
                        {"add": {"index": self.settings.index, "alias": self.settings.alias}}
                    ]
-               })),
+               },
                timeout=coalesce(self.settings.timeout, 30)
            )

@@ -160,9 +174,10 @@ class Index(object):
        """
        RETURN THE INDEX USED BY THIS alias
        """
+       alias_list = self.cluster.get_aliases()
        output = sort([
            a.index
-           for a in self.cluster.get_aliases()
+           for a in alias_list
            if a.alias == alias or
            a.index == alias or
            (re.match(re.escape(alias) + "\\d{8}_\\d{6}", a.index) and a.index != alias)

@@ -186,7 +201,7 @@ class Index(object):
        return True

    def flush(self):
-       self.cluster._post("/" + self.settings.index + "/_refresh")
+       self.cluster.post("/" + self.settings.index + "/_refresh")

    def delete_record(self, filter):
        if self.settings.read_only:

@@ -250,6 +265,9 @@ class Index(object):
                Log.error("Expecting every record given to have \"value\" or \"json\" property")

            lines.append('{"index":{"_id": ' + convert.value2json(id) + '}}')
+           if self.settings.tjson:
+               lines.append(json2typed(json))
+           else:
                lines.append(json)
        del records

@@ -263,7 +281,7 @@ class Index(object):
            Log.error("can not make request body from\n{{lines|indent}}", lines=lines, cause=e)


-       response = self.cluster._post(
+       response = self.cluster.post(
            self.path + "/_bulk",
            data=data_bytes,
            headers={"Content-Type": "text"},

@@ -279,7 +297,7 @@ class Index(object):
                    error=item.index.error,
                    line=lines[i * 2 + 1]
                )
-           elif any(map(self.cluster.version.startswith, ["1.4.", "1.5.", "1.6."])):
+           elif any(map(self.cluster.version.startswith, ["1.4.", "1.5.", "1.6.", "1.7."])):
                if item.index.status not in [200, 201]:
                    Log.error(
                        "{{num}} {{error}} while loading line into {{index}}:\n{{line}}",

@@ -323,7 +341,7 @@ class Index(object):
            Log.error("Can not set refresh interval ({{error}})", {
                "error": utf82unicode(response.all_content)
            })
-       elif any(map(self.cluster.version.startswith, ["1.4.", "1.5.", "1.6."])):
+       elif any(map(self.cluster.version.startswith, ["1.4.", "1.5.", "1.6.", "1.7."])):
            response = self.cluster.put(
                "/" + self.settings.index + "/_settings",
                data=convert.unicode2utf8('{"index":{"refresh_interval":' + convert.value2json(interval) + '}}')

@@ -347,9 +365,9 @@ class Index(object):
            else:
                show_query = query
            Log.note("Query:\n{{query|indent}}", query=show_query)
-           return self.cluster._post(
+           return self.cluster.post(
                self.path + "/_search",
-               data=convert.value2json(query).encode("utf8"),
+               data=query,
                timeout=coalesce(timeout, self.settings.timeout)
            )
        except Exception, e:

@@ -374,24 +392,41 @@ class Index(object):
        self.cluster.delete_index(index=self.settings.index)


+known_clusters = {}
+
class Cluster(object):

    @use_settings
-   def __init__(self, host, port=9200, settings=None):
+   def __new__(cls, host, port=9200, settings=None):
+       if not isinstance(port, int):
+           Log.error("port must be integer")
+       cluster = known_clusters.get((host, port))
+       if cluster:
+           return cluster
+
+       cluster = object.__new__(cls)
+       known_clusters[(host, port)] = cluster
+       return cluster
+
+   @use_settings
+   def __init__(self, host, port=9200, explore_metadata=True, settings=None):
        """
        settings.explore_metadata == True - IF PROBING THE CLUSTER FOR METADATA IS ALLOWED
        settings.timeout == NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        """
+       if hasattr(self, "settings"):
+           return
+
        settings.setdefault("explore_metadata", True)

-       self.cluster_state = None
-       self.cluster_metadata = None
-
-       self.debug = settings.debug
        self.settings = settings
+       self.cluster_state = None
+       self._metadata = None
+       self.metadata_locker = Lock()
+       self.debug = settings.debug
        self.version = None
        self.path = settings.host + ":" + unicode(settings.port)

+       self.get_metadata()
+
    @use_settings
    def get_or_create_index(
        self,

@@ -400,6 +435,7 @@ class Cluster(object):
        schema=None,
        limit_replicas=None,
        read_only=False,
+       tjson=False,
        settings=None
    ):
        best = self._get_best(settings)

@@ -489,6 +525,7 @@ class Cluster(object):
        schema=None,
        limit_replicas=None,
        read_only=False,
+       tjson=False,
        settings=None
    ):
        if not settings.alias:

@@ -518,7 +555,7 @@ class Cluster(object):
            )
            schema.settings.index.number_of_replicas = health.number_of_nodes - 1

-       self._post(
+       self.post(
            "/" + settings.index,
            data=convert.value2json(schema).encode("utf8"),
            headers={"Content-Type": "application/json"}

@@ -542,9 +579,9 @@ class Cluster(object):
        RETURN LIST OF {"alias":a, "index":i} PAIRS
        ALL INDEXES INCLUDED, EVEN IF NO ALIAS {"alias":Null}
        """
-       data = self.get_metadata().indices
+       data = self.get("/_cluster/state")
        output = []
-       for index, desc in data.items():
+       for index, desc in data.metadata.indices.items():
            if not desc["aliases"]:
                output.append({"index": index, "alias": None})
            else:

@@ -552,29 +589,38 @@ class Cluster(object):
                    output.append({"index": index, "alias": a})
        return wrap(output)

-   def get_metadata(self):
+   def get_metadata(self, index=None, force=False):
+       with self.metadata_locker:
            if self.settings.explore_metadata:
-               if not self.cluster_metadata:
+               if not self._metadata or (force and index is None):
                    response = self.get("/_cluster/state")
-                   self.cluster_metadata = wrap(response.metadata)
+                   self._metadata = wrap(response.metadata)
                    self.cluster_state = wrap(self.get("/"))
                    self.version = self.cluster_state.version.number
+               elif index:  # UPDATE THE MAPPING FOR ONE INDEX ONLY
+                   response = self.get("/"+index+"/_mapping")
+                   self._metadata.indices[index].mappings = qb.sort(response.items(), 0).last()[1].mappings
+                   return Dict(indices={index: self._metadata.indices[index]})
            else:
                Log.error("Metadata exploration has been disabled")
-           return self.cluster_metadata
+           return self._metadata

-   def _post(self, path, **kwargs):
+   def post(self, path, **kwargs):
        url = self.settings.host + ":" + unicode(self.settings.port) + path

        try:
            wrap(kwargs).headers["Accept-Encoding"] = "gzip,deflate"

-           if "data" in kwargs and not isinstance(kwargs["data"], str):
+           data = kwargs.get(b'data')
+           if data == None:
+               pass
+           elif isinstance(data, Mapping):
+               kwargs[b'data'] = data =convert.unicode2utf8(convert.value2json(data))
+           elif not isinstance(kwargs["data"], str):
                Log.error("data must be utf8 encoded string")

            if self.debug:
-               sample = kwargs.get("data", "")[:300]
+               sample = kwargs.get(b'data', "")[:300]
                Log.note("{{url}}:\n{{data|indent}}", url=url, data=sample)

            response = http.post(url, **kwargs)

@@ -597,9 +643,12 @@ class Cluster(object):
            suggestion = ""

        if kwargs.get("data"):
-           Log.error("Problem with call to {{url}}" + suggestion + "\n{{body|left(10000)}}",
+           Log.error(
+               "Problem with call to {{url}}" + suggestion + "\n{{body|left(10000)}}",
                url=url,
-               body=kwargs["data"][0:10000] if self.debug else kwargs["data"][0:100], cause=e)
+               body=kwargs["data"][0:10000] if self.debug else kwargs["data"][0:100],
+               cause=e
+           )
        else:
            Log.error("Problem with call to {{url}}" + suggestion, url=url, cause=e)

@@ -718,7 +767,7 @@ def _scrub(r):
        if len(output) == 1:
            return output[0]
        try:
-           return sort(output)  # SUCCESS ONLY ON STRINGS, OR NUMBERS
+           return sort(output)
        except Exception:
            return output
    else:

@@ -728,7 +777,7 @@ def _scrub(r):



-class Alias(object):
+class Alias(Features):
    @use_settings
    def __init__(
        self,

@@ -751,11 +800,13 @@ class Alias(object):
            Log.error("Alias() was given no `type` (aka schema) and not allowed to explore metadata. Do not know what to do now.")

-       indices = self.cluster.get_metadata().indices
-       if not self.settings.alias or self.settings.alias == self.settings.index:
-           candidates = [(name, i) for name, i in indices.items() if self.settings.index in i.aliases]
-           index = qb.sort(candidates, 0).last()[1]
+       if not self.settings.alias or self.settings.alias==self.settings.index:
+           alias_list = self.cluster.get("/_alias/"+self.settings.index)
+           candidates = [(name, i) for name, i in alias_list.items() if self.settings.index in i.aliases.keys()]
+           full_name = qb.sort(candidates, 0).last()[0]
+           index = self.cluster.get("/" + full_name + "/_mapping")[full_name]
        else:
-           index = indices[self.settings.index]
+           index = self.cluster.get("/"+self.settings.index+"/_mapping")[self.settings.index]

        # FIND MAPPING WITH MOST PROPERTIES (AND ASSUME THAT IS THE CANONICAL TYPE)
        max_prop = -1

@@ -773,7 +824,7 @@ class Alias(object):

    @property
    def url(self):
-       return self.cluster.path + "/" + self.path
+       return self.cluster.path.rstrip("/") + "/" + self.path.lstrip("/")

    def get_schema(self, retry=True):
        if self.settings.explore_metadata:

@@ -871,8 +922,8 @@ class Alias(object):
            show_query.facets = {k: "..." for k in query.facets.keys()}
        else:
            show_query = query
-       Log.note("Query:\n{{query|indent}}", query= show_query)
-       return self.cluster._post(
+       Log.note("Query:\n{{query|indent}}", query=show_query)
+       return self.cluster.post(
            self.path + "/_search",
            data=convert.value2json(query).encode("utf8"),
            timeout=coalesce(timeout, self.settings.timeout)

@@ -886,6 +937,98 @@ class Alias(object):
        )


+def parse_properties(parent_index_name, parent_query_path, esProperties):
+   """
+   RETURN THE COLUMN DEFINITIONS IN THE GIVEN esProperties OBJECT
+   """
+   from pyLibrary.queries.meta import Column
+
+   columns = DictList()
+   for name, property in esProperties.items():
+       if parent_query_path:
+           index_name, query_path = parent_index_name, join_field(split_field(parent_query_path) + [name])
+       else:
+           index_name, query_path = parent_index_name, name
+
+       if property.type == "nested" and property.properties:
+           # NESTED TYPE IS A NEW TYPE DEFINITION
+           # MARKUP CHILD COLUMNS WITH THE EXTRA DEPTH
+           self_columns = parse_properties(index_name, query_path, property.properties)
+           for c in self_columns:
+               if not c.nested_path:
+                   c.nested_path = [query_path]
+               else:
+                   c.nested_path.insert(0, query_path)
+           columns.extend(self_columns)
+           columns.append(Column(
+               table=index_name,
+               name=query_path,
+               abs_name=query_path,
+               type="nested",
+               nested_path=[name]
+           ))
+
+           continue
+
+       if property.properties:
+           child_columns = parse_properties(index_name, query_path, property.properties)
+           columns.extend(child_columns)
+           columns.append(Column(
+               table=index_name,
+               name=query_path,
+               abs_name=query_path,
+               type="object"
+           ))
+
+       if property.dynamic:
+           continue
+       if not property.type:
+           continue
+       if property.type == "multi_field":
+           property.type = property.fields[name].type  # PULL DEFAULT TYPE
+           for i, (n, p) in enumerate(property.fields.items()):
+               if n == name:
+                   # DEFAULT
+                   columns.append(Column(
+                       table=index_name,
+                       name=query_path,
+                       type=p.type
+                   ))
+               else:
+                   columns.append(Column(
+                       table=index_name,
+                       name=query_path + "." + n,
+                       type=p.type
+                   ))
+           continue
+
+       if property.type in ["string", "boolean", "integer", "date", "long", "double"]:
+           columns.append(Column(
+               table=index_name,
+               name=query_path,
+               abs_name=query_path,
+               type=property.type
+           ))
+           if property.index_name and name != property.index_name:
+               columns.append(Column(
+                   table=index_name,
+                   name=property.index_name,
+                   type=property.type
+               ))
+       elif property.enabled == None or property.enabled == False:
+           columns.append(Column(
+               table=index_name,
+               name=query_path,
+               abs_name=query_path,
+               type="object"
+           ))
+       else:
+           Log.warning("unknown type {{type}} for property {{path}}", type=property.type, path=query_path)
+
+   return columns
+
+
+def _merge_mapping(a, b):
+   """
+   MERGE TWO MAPPINGS, a TAKES PRECEDENCE

@@ -990,5 +1133,3 @@ _merge_type = {
    }
}

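The practical effect of the new `__new__` above is that `Cluster` is now a singleton per `(host, port)` pair, so every index on the same cluster shares one connection and one metadata cache. A sketch (the host value is invented):

    c1 = Cluster(host="http://localhost", port=9200)
    c2 = Cluster(host="http://localhost", port=9200)
    assert c1 is c2   # same object, served from known_clusters

    # the guard at the top of __init__ ("if hasattr(self, 'settings'): return")
    # keeps the second construction from re-initializing the shared instance
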
@@ -16,7 +16,8 @@ import shutil

from pyLibrary.strings import utf82unicode
from pyLibrary.maths import crypto
-from pyLibrary.dot import coalesce
+from pyLibrary.dot import coalesce, set_default, split_field, join_field
from pyLibrary.dot import listwrap, wrap
from pyLibrary import convert

@@ -99,7 +99,7 @@ def request(method, url, **kwargs):
        if " Read timed out." in e:
            Log.error("Timeout failure (timeout was {{timeout}}", timeout=timeout, cause=e)
        else:
-           Log.error("Request failure", e)
+           Log.error("Request failure of {{url}}", url=url, cause=e)


def _to_ascii_dict(headers):

@@ -5,7 +5,7 @@ import json
import re
from types import NoneType

-from pyLibrary.dot import DictList, NullType
+from pyLibrary.dot import DictList, NullType, Dict, unwrap
from pyLibrary.dot.objects import DictObject
from pyLibrary.times.dates import Date

@@ -22,10 +22,10 @@ def _late_import():
    global datetime2unix
    global utf82unicode

-   from pyLibrary.debugs.logs import Log
+   from pyLibrary.debugs.logs import Log as _Log
    from pyLibrary.convert import datetime2unix, utf82unicode

-   _ = Log
+   _ = _Log
    _ = datetime2unix
    _ = utf82unicode

@@ -50,7 +50,6 @@ def replace(match):


def quote(value):
-   value
    return "\"" + ESCAPE.sub(replace, value) + "\""

@@ -82,6 +81,8 @@ def _scrub(value, is_done):
        return utf82unicode(value)
    elif type_ is Decimal:
        return float(value)
+   elif type_ is Dict:
+       return _scrub(unwrap(value), is_done)
    elif isinstance(value, Mapping):
        _id = id(value)
        if _id in is_done:

@@ -67,11 +67,14 @@ except Exception, e:

append = UnicodeBuilder.append

+_dealing_with_problem = False
+

def pypy_json_encode(value, pretty=False):
    """
    pypy DOES NOT OPTIMIZE GENERATOR CODE WELL
    """
+   global _dealing_with_problem
    if pretty:
        return pretty_json(value)

@@ -83,14 +86,23 @@ def pypy_json_encode(value, pretty=False):
    except Exception, e:
        # THE PRETTY JSON WILL PROVIDE MORE DETAIL ABOUT THE SERIALIZATION CONCERNS
        from pyLibrary.debugs.logs import Log

-       Log.warning("Serialization of JSON problems", e)
+       if _dealing_with_problem:
+           Log.error("Serialization of JSON problems", e)
+       else:
+           Log.warning("Serialization of JSON problems", e)
+           _dealing_with_problem = True
        try:
            return pretty_json(value)
        except Exception, f:
            Log.error("problem serializing object", f)
+       finally:
+           _dealing_with_problem = False


almost_pattern = r"(?:\.(\d*)999)|(?:\.(\d*)000)"


def float_repr(value):
    output = repr(value)
    d = output.find(".")

@@ -107,13 +119,14 @@ def float_repr(value):
    else:
        return output


json_encoder_module.FLOAT_REPR = float_repr


class cPythonJSONEncoder(object):
    def __init__(self):
        object.__init__(self)

        self.encoder = json.JSONEncoder(
            skipkeys=False,
            ensure_ascii=False,  # DIFF FROM DEFAULTS

@@ -135,6 +148,7 @@ class cPythonJSONEncoder(object):
            return unicode(self.encoder.encode(scrubbed))
        except Exception, e:
            from pyLibrary.debugs.logs import Log, Except
+
            e = Except.wrap(e)
            Log.warning("problem serializing {{type}}", type=_repr(value), cause=e)
            raise e

@@ -242,7 +256,6 @@ def _dict2json(value, _buffer):
    append(_buffer, u"}")


ARRAY_ROW_LENGTH = 80
ARRAY_ITEM_MAX_LENGTH = 30
ARRAY_MAX_COLUMNS = 10

@@ -262,7 +275,7 @@ def pretty_json(value):
            from pyLibrary.debugs.logs import Log

            try:
-               Log.note("try explicit convert of string with length {{length}}", length= len(value))
+               Log.note("try explicit convert of string with length {{length}}", length=len(value))
                acc = [u"\""]
                for c in value:
                    try:

@@ -277,7 +290,7 @@ def pretty_json(value):
                        # Log.warning("odd character {{ord}} found in string. Ignored.", ord= ord(c)}, cause=g)
                acc.append(u"\"")
                output = u"".join(acc)
-               Log.note("return value of length {{length}}", length= len(output))
+               Log.note("return value of length {{length}}", length=len(output))
                return output
            except BaseException, f:
                Log.warning("can not even explicit convert", f)

@@ -291,8 +304,8 @@ def pretty_json(value):
            return "{" + quote(unicode(items[0][0])) + ": " + pretty_json(items[0][1]).strip() + "}"

        items = sorted(items, lambda a, b: value_compare(a[0], b[0]))
-       values = [quote(unicode(k))+": " + indent(pretty_json(v)).strip() for k, v in items if v != None]
-       return "{\n" + INDENT + (",\n"+INDENT).join(values) + "\n}"
+       values = [quote(unicode(k)) + ": " + indent(pretty_json(v)).strip() for k, v in items if v != None]
+       return "{\n" + INDENT + (",\n" + INDENT).join(values) + "\n}"
    except Exception, e:
        from pyLibrary.debugs.logs import Log
        from pyLibrary.collections import OR

@@ -309,7 +322,7 @@ def pretty_json(value):
        if not value:
            return "[]"

-       if ARRAY_MAX_COLUMNS==1:
+       if ARRAY_MAX_COLUMNS == 1:
            return "[\n" + ",\n".join([indent(pretty_json(v)) for v in value]) + "\n]"

        if len(value) == 1:

@@ -323,14 +336,14 @@ def pretty_json(value):
        max_len = max(*[len(j) for j in js])
        if max_len <= ARRAY_ITEM_MAX_LENGTH and max(*[j.find("\n") for j in js]) == -1:
            # ALL TINY VALUES
-           num_columns = max(1, min(ARRAY_MAX_COLUMNS, int(floor((ARRAY_ROW_LENGTH + 2.0)/float(max_len+2)))))  # +2 TO COMPENSATE FOR COMMAS
-           if len(js)<=num_columns:  # DO NOT ADD \n IF ONLY ONE ROW
+           num_columns = max(1, min(ARRAY_MAX_COLUMNS, int(floor((ARRAY_ROW_LENGTH + 2.0) / float(max_len + 2)))))  # +2 TO COMPENSATE FOR COMMAS
+           if len(js) <= num_columns:  # DO NOT ADD \n IF ONLY ONE ROW
                return "[" + ", ".join(js) + "]"
            if num_columns == 1:  # DO NOT rjust IF THERE IS ONLY ONE COLUMN
                return "[\n" + ",\n".join([indent(pretty_json(v)) for v in value]) + "\n]"

        content = ",\n".join(
-           ", ".join(j.rjust(max_len) for j in js[r:r+num_columns])
+           ", ".join(j.rjust(max_len) for j in js[r:r + num_columns])
            for r in xrange(0, len(js), num_columns)
        )
        return "[\n" + indent(content) + "\n]"

@@ -363,13 +376,13 @@ def pretty_json(value):
            return "null"
        else:
            try:
-               if int(value)==value:
+               if int(value) == value:
                    return str(int(value))
            except Exception, e:
                pass

            try:
-               if float(value)==value:
+               if float(value) == value:
                    return str(float(value))
            except Exception, e:
                pass

@@ -450,13 +463,11 @@ def datetime2milli(d, type):
_repr_ = Repr()
_repr_.maxlevel = 2


def _repr(obj):
    return _repr_.repr(obj)


# OH HUM, cPython with uJSON, OR pypy WITH BUILTIN JSON?
# http://liangnuren.wordpress.com/2012/08/13/python-json-performance/
# http://morepypy.blogspot.ca/2011/10/speeding-up-json-encoding-in-pypy.html

@@ -207,7 +207,7 @@ def get_file(ref, url):
    except Exception, e:
        try:
            new_value = _convert.ini2value(content)
-       except Exception, f:
+       except Exception:
            raise _Log.error("Can not read {{file}}", file=path, cause=e)
    new_value = _replace_ref(new_value, ref)
    return new_value

@@ -218,7 +218,13 @@ def json2typed(json):
                mode = VALUE
            elif c == ",":
                mode = context.pop()
-           elif c in "]}":
+               if mode != OBJECT:
+                   context.append(mode)
+                   mode = VALUE
+           elif c in "]":
                mode = context.pop()
+           elif c in "}":
+               mode = context.pop()
+               mode = context.pop()
            elif c == '"':
                context.append(mode)

@@ -276,6 +282,8 @@ def json2typed(json):
                context.append(mode)
                context.append(KEYWORD)
                mode = STRING
+           elif c == ",":
+               pass
            elif c == '}':
                mode = context.pop()
            else:

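For orientation, `json2typed` rewrites plain JSON into the "typed" form that the `tjson` indexes store. Judging from the `$value` property in the log schema above, the encoding wraps primitive values, roughly like this sketch (an assumption based on that schema, not a quoted result; the exact encoding is defined in pyLibrary.jsons.typed_encoder):

    json2typed('{"a": 1, "b": {"c": "x"}}')
    # => '{"a": {"$value": 1}, "b": {"c": {"$value": "x"}}}'   (assumed shape)
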
@@ -234,7 +234,7 @@ class Math(object):

    @staticmethod
    def MAX(values):
-       output = None
+       output = Null
        for v in values:
            if v == None:
                continue

@@ -124,14 +124,14 @@ def use_settings(func):

    def wrapper(*args, **kwargs):
        try:
-           if func.func_name == "__init__" and "settings" in kwargs:
+           if func.func_name in ("__init__", "__new__") and "settings" in kwargs:
                packed = params_pack(params, kwargs, dot.zip(params[1:], args[1:]), kwargs["settings"], defaults)
                return func(args[0], **packed)
-           elif func.func_name == "__init__" and len(args) == 2 and len(kwargs) == 0 and isinstance(args[1], Mapping):
+           elif func.func_name in ("__init__", "__new__") and len(args) == 2 and len(kwargs) == 0 and isinstance(args[1], Mapping):
                # ASSUME SECOND UNNAMED PARAM IS settings
                packed = params_pack(params, args[1], defaults)
                return func(args[0], **packed)
-           elif func.func_name == "__init__":
+           elif func.func_name in ("__init__", "__new__"):
                # DO NOT INCLUDE self IN SETTINGS
                packed = params_pack(params, kwargs, dot.zip(params[1:], args[1:]), defaults)
                return func(args[0], **packed)

@@ -1,19 +1,18 @@
-from urlparse import urlparse
-
-from pyLibrary.dot import wrap
+from urlparse import urlparse, parse_qs
+from pyLibrary.dot import Null, coalesce, wrap
from pyLibrary.dot.dicts import Dict


-_convert = None
-_Log = None
+convert = None
+Log = None


def _late_import():
-   global _convert
-   global _Log
-   from pyLibrary import convert as _convert
-   from pyLibrary.debugs.logs import Log as _Log
-   _ = _convert
-   _ = _Log
+   global convert
+   global Log
+   from pyLibrary import convert
+   from pyLibrary.debugs.logs import Log


names = ["path", "query", "fragment"]
indicator = ["/", "?", "#"]

@@ -50,7 +49,7 @@ class URL(object):
        if value == None:
            return

-       if not _convert:
+       if not convert:
            _late_import()
        if value.startswith("file://") or value.startswith("//"):
            # urlparse DOES NOT WORK IN THESE CASES

@@ -58,7 +57,7 @@ class URL(object):
            self.scheme = scheme.rstrip(":")
            parse(self, suffix, 0, 1)

-           self.query = wrap(_convert.url_param2value(self.query))
+           self.query = wrap(convert.url_param2value(self.query))
            self.fragment = self.fragment
        else:
            output = urlparse(value)

@@ -66,7 +65,7 @@ class URL(object):
            self.port = output.port
            self.host = output.netloc.split(":")[0]
            self.path = output.path
-           self.query = wrap(_convert.url_param2value(output.query))
+           self.query = wrap(convert.url_param2value(output.query))
            self.fragment = output.fragment

    def __nonzero__(self):

@@ -90,9 +89,9 @@ class URL(object):
        if self.path:
            url += str(self.path)
        if self.query:
-           url = url + '?' + _convert.value2url(self.query)
+           url = url + '?' + convert.value2url(self.query)
        if self.fragment:
-           url = url + '#' + _convert.value2url(self.fragment)
+           url = url + '#' + convert.value2url(self.fragment)
        return url

@@ -0,0 +1,6 @@
+MoQueries
+=========
+
+A Python library that supports [Qb queries](https://github.com/klahnakoski/ActiveData/blob/dev/docs/Qb_Tutorial.md "Qb Queries"), and a variety of other set-operations.

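For orientation, a Qb query is an ordinary JSON-like structure, the same shape the `Dimension` class below sends through `qb.query()`. A sketch with invented index and field names (the `where` clause here uses an elasticsearch-style filter, as the surrounding code does):

    {
        "from": "bugs",
        "select": {"name": "count", "aggregate": "count"},
        "edges": ["status"],
        "where": {"term": {"status": "open"}},
        "limit": 10
    }
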
@@ -0,0 +1,87 @@
+# encoding: utf-8
+#
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
+#
+from __future__ import unicode_literals
+from collections import Mapping
+
+from pyLibrary.debugs.logs import Log
+from pyLibrary.dot import wrap, set_default, split_field
+from pyLibrary.dot.dicts import Dict
+from pyLibrary.queries import containers
+
+type2container = Dict()
+config = Dict()   # config.default IS EXPECTED TO BE SET BEFORE CALLS ARE MADE
+_ListContainer = None
+
+
+def _delayed_imports():
+    global type2container
+    global _ListContainer
+
+    from pyLibrary.queries.containers.lists import ListContainer as _ListContainer
+    _ = _ListContainer
+
+    from pyLibrary.queries.qb_usingMySQL import MySQL
+    from pyLibrary.queries.qb_usingES import FromES
+    from pyLibrary.queries.meta import FromESMetadata
+
+    set_default(type2container, {
+        "elasticsearch": FromES,
+        "mysql": MySQL,
+        "memory": None,
+        "meta": FromESMetadata
+    })
+
+
+def wrap_from(frum, schema=None):
+    """
+    :param frum:
+    :param schema:
+    :return:
+    """
+    if not type2container:
+        _delayed_imports()
+
+    frum = wrap(frum)
+
+    if isinstance(frum, basestring):
+        if not containers.config.default.settings:
+            Log.error("expecting pyLibrary.queries.query.config.default.settings to contain default elasticsearch connection info")
+
+        type_ = None
+        index = frum
+        if frum.startswith("meta."):
+            type_ = "meta"
+        else:
+            index = split_field(frum)[0]
+
+        settings = set_default(
+            {
+                "index": index,
+                "name": frum,
+                "type": type_
+            },
+            containers.config.default.settings
+        )
+        return type2container[settings.type](settings)
+    elif isinstance(frum, Mapping) and frum.type and type2container[frum.type]:
+        # TODO: Ensure the frum.name is set, so we capture the deep queries
+        if not frum.type:
+            Log.error("Expecting from clause to have a 'type' property")
+        return type2container[frum.type](frum.settings)
+    elif isinstance(frum, Mapping) and (frum["from"] or isinstance(frum["from"], (list, set))):
+        from pyLibrary.queries.query import Query
+        return Query(frum, schema=schema)
+    elif isinstance(frum, list):
+        return _ListContainer(frum)
+    else:
+        return frum
+
+
+import es09.util

@@ -16,7 +16,7 @@ from copy import copy
from types import GeneratorType

from pyLibrary.debugs.logs import Log
-from pyLibrary.dot import set_default, split_field, wrap
+from pyLibrary.dot import set_default, split_field, wrap, DictList
from pyLibrary.dot.dicts import Dict

type2container = Dict()

@@ -25,7 +25,7 @@ _ListContainer = None
_Cube = None
_run = None
_Query = None
-
+_Normal = None

def _delayed_imports():
    global type2container

@@ -33,6 +33,7 @@ def _delayed_imports():
    global _Cube
    global _run
    global _Query
+   global _Normal

    from pyLibrary.queries.qb_usingMySQL import MySQL as _MySQL
    from pyLibrary.queries.qb_usingES import FromES as _FromES

@@ -49,10 +50,11 @@ def _delayed_imports():

    _ = _run
    _ = _Query
+   _ = _Normal


class Container(object):
-   __slots__ = ["data", "schema"]
+   __slots__ = ["data", "schema", "namespaces"]

    @classmethod
    def new_instance(type, frum, schema=None):

@@ -100,8 +102,14 @@ class Container(object):

    def __init__(self, frum, schema=None):
        object.__init__(self)
+       if not type2container:
+           _delayed_imports()
+
        self.data = frum
+       if isinstance(schema, list):
+           Log.error("expecting map from abs_name to column object")
        self.schema = schema
+       # self.namespaces = wrap([_Normal()])

    def query(self, query):
        if query.frum != self:

@@ -135,7 +143,7 @@ class Container(object):
        _ = format
        Log.error("not implemented")

-   def get_columns(self, frum):
+   def get_columns(self, table):
        """
        USE THE frum TO DETERMINE THE COLUMNS
        """

@@ -16,11 +16,11 @@ from pyLibrary import convert
from pyLibrary.collections.matrix import Matrix
from pyLibrary.collections import MAX, OR
from pyLibrary.queries.containers import Container
-# from pyLibrary.queries.query import _normalize_edge
from pyLibrary.dot import Null, Dict
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import wrap, wrap_dot, listwrap
from pyLibrary.debugs.logs import Log
+from pyLibrary.queries.query import _normalize_edge


class Cube(Container):

@@ -272,7 +272,7 @@ class Cube(Container):
        if len(self.edges)==1 and self.edges[0].domain.type=="index":
            # USE THE STANDARD LIST FILTER
            from pyLibrary.queries import qb
-           return qb.filter(where, self.data.values()[0].cube)
+           return qb.filter(self.data.values()[0].cube, where)
        else:
            # FILTER DOES NOT ALTER DIMESIONS, JUST WHETHER THERE ARE VALUES IN THE CELLS
            Log.unexpected("Incomplete")

@ -10,26 +10,37 @@
|
|||
from __future__ import unicode_literals
|
||||
from __future__ import division
|
||||
from __future__ import absolute_import
|
||||
from collections import Mapping
|
||||
from pyLibrary import convert
|
||||
|
||||
from pyLibrary.debugs.logs import Log
|
||||
from pyLibrary.dot import Dict, wrap
|
||||
from pyLibrary.dot import Dict, wrap, listwrap, unwraplist
|
||||
from pyLibrary.queries import qb
|
||||
from pyLibrary.queries.containers import Container
|
||||
from pyLibrary.queries.expressions import TRUE_FILTER
|
||||
from pyLibrary.queries.list.aggs import is_aggs, list_aggs
|
||||
from pyLibrary.queries.domains import is_keyword
|
||||
from pyLibrary.queries.expressions import TRUE_FILTER, qb_expression_to_python
|
||||
from pyLibrary.queries.lists.aggs import is_aggs, list_aggs
|
||||
from pyLibrary.queries.meta import Column
|
||||
from pyLibrary.thread.threads import Lock
|
||||
|
||||
|
||||
class ListContainer(Container):
|
||||
def __init__(self, frum, schema=None):
|
||||
Container.__init__(self, frum, schema)
|
||||
self.frum = list(frum)
|
||||
#TODO: STORE THIS LIKE A CUBE FOR FASTER ACCESS AND TRANSFORMATION
|
||||
frum = list(frum)
|
||||
if schema == None:
|
||||
self.schema = get_schema_from_list(frum)
|
||||
Container.__init__(self, frum, schema)
|
||||
self.frum = frum
|
||||
|
||||
@property
|
||||
def query_path(self):
|
||||
return None
|
||||
|
||||
def query(self, q):
|
||||
frum = self.frum
|
||||
frum = self
|
||||
if is_aggs(q):
|
||||
frum = list_aggs(frum, q)
|
||||
frum = list_aggs(frum.data, q)
|
||||
else: # SETOP
|
||||
try:
|
||||
if q.filter != None or q.esfilter != None:
|
||||
|
@ -51,22 +62,55 @@ class ListContainer(Container):
|
|||
|
||||
return frum.format(q.format)
|
||||
|
||||
def update(self, command):
|
||||
"""
|
||||
EXPECTING command == {"set":term, "clear":term, "where":where}
|
||||
THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
|
||||
THE where CLAUSE IS AN ES FILTER
|
||||
"""
|
||||
command = wrap(command)
|
||||
if command.where==None:
|
||||
filter_ = lambda: True
|
||||
else:
|
||||
filter_ = _exec("temp = lambda row: "+qb_expression_to_python(command.where))
|
||||
|
||||
|
||||
for c in self.data:
|
||||
if filter_(c):
|
||||
for k in command["clear"].keys():
|
||||
c[k] = None
|
||||
for k, v in command.set.items():
|
||||
c[k] = v
|
||||
|
||||
def filter(self, where):
|
||||
return self.where(where)
|
||||
|
||||
def where(self, where):
|
||||
_ = where
|
||||
Log.error("not implemented")
|
||||
if isinstance(where, Mapping):
|
||||
temp = None
|
||||
exec("def temp(row):\n return "+qb_expression_to_python(where))
|
||||
else:
|
||||
temp = where
|
||||
|
||||
return ListContainer(filter(temp, self.data), self.schema)
|
||||
|
||||
def sort(self, sort):
|
||||
_ = sort
|
||||
Log.error("not implemented")
|
||||
return ListContainer(qb.sort(self.data, sort), self.schema)
|
||||
|
||||
def select(self, select):
|
||||
_ = select
|
||||
Log.error("not implemented")
|
||||
selects = listwrap(select)
|
||||
if selects[0].value == "*" and selects[0].name == ".":
|
||||
return self
|
||||
|
||||
for s in selects:
|
||||
if not isinstance(s.value, basestring) or not is_keyword(s.value):
|
||||
Log.error("selecting on structure, or expressions, not supported yet")
|
||||
|
||||
#TODO: DO THIS WITH JUST A SCHEMA TRANSFORM, DO NOT TOUCH DATA
|
||||
#TODO: HANDLE STRUCTURE AND EXPRESSIONS
|
||||
new_schema = {s.name: self.schema[s.value] for s in selects}
|
||||
new_data = [{s.name: d[s.value] for s in selects} for d in self.data]
|
||||
return ListContainer(frum=new_data, schema=new_schema)
|
||||
|
||||
def window(self, window):
|
||||
_ = window
|
||||

@ -78,17 +122,35 @@ class ListContainer(Container):

    def format(self, format):
        if format == "table":
            frum = convert.list2table(self.data)
            frum.meta.format = "table"
            frum = convert.list2table(self.data, self.schema.keys())
        elif format == "cube":
            frum = convert.list2cube(self.data, self.schema.keys())
        else:
            frum = wrap({
            frum = self

        return frum

    def insert(self, documents):
        self.data.extend(documents)

    def extend(self, documents):
        self.data.extend(documents)

    def add(self, doc):
        self.data.append(doc)

    def to_dict(self):
        return wrap({
            "meta": {"format": "list"},
            "data": self.data
            "data": [{k: unwraplist(v) for k, v in row.items()} for row in self.data]
        })

    def get_columns(self, frum):
    def get_columns(self, table=None):
        return self.schema.values()

    def __getitem__(self, item):
        return self.data[item]


def get_schema_from_list(frum):
    """

@ -98,7 +160,7 @@ def get_schema_from_list(frum):
    _get_schema_from_list(frum, columns, [], 0)
    return columns

def _get_schema_from_list(frum, columns, prefix, depth):
def _get_schema_from_list(frum, columns, prefix, nested_path):
    """
    SCAN THE LIST FOR COLUMN TYPES
    """

@ -111,14 +173,21 @@ def _get_schema_from_list(frum, columns, prefix, depth):
            names[name] = new_type

            if this_type == "object":
                _get_schema_from_list([value], columns, prefix + [name], depth)
                _get_schema_from_list([value], columns, prefix + [name], nested_path)
            elif this_type == "nested":
                _get_schema_from_list(value, columns, prefix + [name], depth+1)
                if not nested_path:
                    _get_schema_from_list(value, columns, prefix + [name], [name])
                else:
                    _get_schema_from_list(value, columns, prefix + [name], [nested_path[0]+"."+name]+nested_path)

    for n, t in names.items():
        full_name = ".".join(prefix + [n])
        column = {"name": full_name, "value": full_name, "type": t, "depth": depth}
        columns[full_name] = column
        column = Column(
            name=full_name,
            type=t,
            nested_path=nested_path
        )
        columns[full_name] = column
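To make the nested_path bookkeeping concrete, a hypothetical document and the paths the scan above would record (illustrative only; exact Column fields follow the code above):

    docs = [{"a": {"b": 1}, "c": [{"d": 2}]}]  # "c" is a nested (list-of-object) property
    # after _get_schema_from_list(docs, columns, [], []):
    #   columns["a.b"].nested_path == []       (plain object, no nesting)
    #   columns["c.d"].nested_path == ["c"]    (first nested step)
    # a deeper list under "c", say "c.e": [{...}], would record ["c.e", "c"]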


_type_to_name = {

@ -229,3 +298,7 @@ _merge_type = {


def _exec(code):
    temp = None
    exec code
    return temp
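The `_exec` helper exists so a generated expression string can be evaluated into a callable. A minimal illustration (the expression string here is made up):

    compiled = _exec('temp = lambda row: row["a"] > 5')
    # compiled({"a": 9}) -> True
    # compiled({"a": 1}) -> False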


@ -14,7 +14,7 @@ import itertools

from pyLibrary.collections.matrix import Matrix
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import listwrap
from pyLibrary.dot import listwrap, unwrap
from pyLibrary.queries import windows
from pyLibrary.queries.containers.cube import Cube
from pyLibrary.queries.domains import SimpleSetDomain, DefaultDomain

@ -26,16 +26,20 @@ DEFAULT_QUERY_LIMIT = 20

class Dimension(Container):
    __slots__ = ["name", "full_name", "where", "type", "limit", "index", "parent", "edges", "partitions", "fields"]

    def __init__(self, dim, parent, qb):
        dim = wrap(dim)

        self.name = dim.name
        self.parent = parent
        self.parent = coalesce(parent)
        self.full_name = join_field(split_field(self.parent.full_name)+[self.name])
        self.edges = None  # FOR NOW
        dot.set_default(self, dim)
        self.esfilter = dim.esfilter
        self.where = dim.where
        self.type = coalesce(dim.type, "set")
        self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT)
        self.index = coalesce(dim.index, coalesce(parent, Null).index, qb.es.settings.name)
        self.index = coalesce(dim.index, coalesce(parent, Null).index, qb.settings.index)

        if not self.index:
            Log.error("Expecting an index name")

@ -61,18 +65,19 @@ class Dimension(Container):

        if dim.partitions:
            return  # ALREADY HAVE PARTS
        if dim.type not in KNOWN - ALGEBRAIC:
        if self.type not in KNOWN - ALGEBRAIC:
            return  # PARTS OR TOO FUZZY (OR TOO NUMEROUS) TO FETCH

        qb.get_columns()
        with Timer("Get parts of {{name}}", {"name": self.name}):
            parts = qb.query({
                "from": self.index,
                "select": {"name": "count", "aggregate": "count"},
                "edges": edges,
                "esfilter": self.esfilter,
                "where": self.where,
                "limit": self.limit
            })
            Log.note("{{name}} has {{num}} parts", name=self.name, num=len(parts))
            Log.note("{{name}} has {{num}} parts", name= self.name, num= len(parts))

        d = parts.edges[0].domain

@ -101,7 +106,7 @@ class Dimension(Container):
            if p:
                partitions.append({
                    "value": g,
                    "esfilter": {"and": [
                    "where": {"and": [
                        {"term": {e.value: g[e.name]}}
                        for e in edges
                    ]},

@ -116,7 +121,7 @@ class Dimension(Container):
                {
                    "name": str(d.partitions[i].name),  # CONVERT TO STRING
                    "value": d.getEnd(d.partitions[i]),
                    "esfilter": {"term": {edges[0].value: d.partitions[i].value}},
                    "where": {"term": {edges[0].value: d.partitions[i].value}},
                    "count": count
                }
                for i, count in enumerate(parts)

@ -142,13 +147,13 @@ class Dimension(Container):
                {
                    "name": str(d.partitions[i].name),  # CONVERT TO STRING
                    "value": d.getEnd(d.partitions[i]),
                    "esfilter": {"term": {edges[0].value: d.partitions[i].value}},
                    "where": {"term": {edges[0].value: d.partitions[i].value}},
                    "count": SUM(subcube),
                    "partitions": [
                        {
                            "name": str(d2.partitions[j].name),  # CONVERT TO STRING
                            "value": edges2value(d.getEnd(d.partitions[i]), d2.getEnd(d2.partitions[j])),
                            "esfilter": {"and": [
                            "where": {"and": [
                                {"term": {edges[0].value: d.partitions[i].value}},
                                {"term": {edges[1].value: d2.partitions[j].value}}
                            ]},

@ -165,11 +170,17 @@ class Dimension(Container):

        parse_partition(self)  # RELATE THE PARTS TO THE PARENTS

    def __getitem__(self, item):
        return self.__getattr__(item)

    def __getattr__(self, key):
        """
        RETURN CHILD EDGE OR PARTITION BY NAME
        """
        #TODO: IGNORE THE STANDARD DIMENSION PROPERTIES TO AVOID ACCIDENTAL SELECTION OF EDGE OR PART
        if key in Dimension.__slots__:
            return None

        e = self.edges[key]
        if e:
            return e

@ -187,14 +198,14 @@ class Dimension(Container):
            # USE EACH EDGE AS A PARTITION, BUT isFacet==True SO IT ALLOWS THE OVERLAP
            partitions = [
                {
                    "name":v.name,
                    "value":v.name,
                    "esfilter":v.esfilter,
                    "style":v.style,
                    "weight":v.weight  # YO! WHAT DO WE *NOT* COPY?
                    "name": v.name,
                    "value": v.name,
                    "where": v.where,
                    "style": v.style,
                    "weight": v.weight  # YO! WHAT DO WE *NOT* COPY?
                }
                for i, v in enumerate(self.edges)
                if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT) and v.esfilter
                if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT) and v.where
            ]
            self.isFacet = True
        elif kwargs.depth == None:  # ASSUME self.fields IS A dict

@ -205,7 +216,7 @@ class Dimension(Container):
                partitions.append({
                    "name":part.name,
                    "value":part.value,
                    "esfilter":part.esfilter,
                    "where":part.where,
                    "style":coalesce(part.style, part.parent.style),
                    "weight":part.weight  # YO! WHAT DO WE *NOT* COPY?
                })

@ -214,7 +225,7 @@ class Dimension(Container):
                {
                    "name":v.name,
                    "value":v.value,
                    "esfilter":v.esfilter,
                    "where":v.where,
                    "style":v.style,
                    "weight":v.weight  # YO! WHAT DO WE *NOT* COPY?
                }

@ -232,7 +243,7 @@ class Dimension(Container):
                partitions.append({
                    "name":join_field(split_field(subpart.parent.name) + [subpart.name]),
                    "value":subpart.value,
                    "esfilter":subpart.esfilter,
                    "where":subpart.where,
                    "style":coalesce(subpart.style, subpart.parent.style),
                    "weight":subpart.weight  # YO! WHAT DO WE *NOT* COPY?
                })

@ -324,12 +335,12 @@ def parse_partition(part):
        p.value = coalesce(p.value, p.name)
        p.parent = part

    if not part.esfilter:
    if not part.where:
        if len(part.partitions) > 100:
            Log.error("Must define an esfilter on {{name}} there are too many partitions ({{num_parts}})",
            Log.error("Must define a where on {{name}}; there are too many partitions ({{num_parts}})",
                name=part.name,
                num_parts=len(part.partitions))

        # DEFAULT esfilter IS THE UNION OF ALL CHILD FILTERS
        # DEFAULT where IS THE UNION OF ALL CHILD FILTERS
        if part.partitions:
            part.esfilter = {"or": part.partitions.esfilter}
            part.where = {"or": part.partitions.where}
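A small illustration of that default, in plain dicts (partition values invented; each child partition is assumed to already carry its own where clause):

    child_wheres = [
        {"term": {"product": "Firefox"}},
        {"term": {"product": "Thunderbird"}},
    ]
    parent_where = {"or": child_wheres}
    # the parent partition matches whatever any of its children match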


@ -14,16 +14,18 @@ from collections import Mapping

from numbers import Number
import re
import itertools

from pyLibrary import convert
from pyLibrary.debugs.logs import Log
from pyLibrary.maths import Math
from pyLibrary.queries.unique_index import UniqueIndex
from pyLibrary.dot import coalesce, Dict, set_default, Null, listwrap
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import wrap, unwrap
from pyLibrary.dot import wrap
from pyLibrary.times.dates import Date
from pyLibrary.times.durations import Duration


ALGEBRAIC = {"time", "duration", "numeric", "count", "datetime"}  # DOMAINS THAT HAVE ALGEBRAIC OPERATIONS DEFINED
KNOWN = {"set", "boolean", "duration", "time", "numeric"}  # DOMAINS THAT HAVE A KNOWN NUMBER FOR PARTS AT QUERY TIME
PARTITION = {"uid", "set", "boolean"}  # DIMENSIONS WITH CLEAR PARTS

@ -132,6 +134,7 @@ class DefaultDomain(Domain):
        self.partitions = DictList()
        self.map = dict()
        self.map[None] = self.NULL
        self.limit = desc.get('limit')

    def compare(self, a, b):
        return value_compare(a.value, b.value)

@ -162,6 +165,7 @@ class DefaultDomain(Domain):
    def as_dict(self):
        output = Domain.as_dict(self)
        output.partitions = self.partitions
        output.limit = self.limit
        return output


@ -284,6 +288,8 @@ class SimpleSetDomain(Domain):
        return self.partitions[index]

    def getKeyByIndex(self, index):
        if index < 0 or index >= len(self.partitions):
            return None
        return self.partitions[index][self.key]

    def getKey(self, part):

@ -533,6 +539,70 @@ class DurationDomain(Domain):
        return output


class NumericDomain(Domain):
    __slots__ = ["max", "min"]

    def __new__(cls, **desc):
        if not desc.get('partitions') and not desc.get('interval'):
            return object.__new__(cls)
        else:
            return object.__new__(RangeDomain)

    def __init__(self, **desc):
        Domain.__init__(self, **desc)
        self.min = desc.get('min')
        self.max = desc.get('max')

    def compare(self, a, b):
        return value_compare(a, b)

    def getCanonicalPart(self, part):
        return part

    def getIndexByKey(self, key):
        return key

    def getPartByKey(self, key):
        if self.min != None and key < self.min:
            return self.NULL
        if self.max != None and key >= self.max:
            return self.NULL
        return key

    def getKey(self, part):
        return part

    def getKeyByIndex(self, index):
        return index

    def as_dict(self):
        output = Domain.as_dict(self)

        output.min = self.min
        output.max = self.max
        return output


class UniqueDomain(Domain):
    __slots__ = ()

    def compare(self, a, b):
        return value_compare(a, b)

    def getCanonicalPart(self, part):
        return part

    def getPartByKey(self, key):
        return key

    def getKey(self, part):
        return part

    def getEnd(self, value):
        return value


class RangeDomain(Domain):
    __slots__ = ["max", "min", "interval", "partitions", "NULL"]

@ -640,9 +710,10 @@ name_to_type = {
    "value": ValueDomain,
    "default": DefaultDomain,
    "set": SimpleSetDomain,
    "uid": DefaultDomain,
    "time": TimeDomain,
    "duration": DurationDomain,
    "range": RangeDomain
    "range": NumericDomain,
    "uid": UniqueDomain,
    "numeric": NumericDomain
}
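Note the dispatch trick in NumericDomain.__new__: a plain min/max description stays a NumericDomain, while supplying partitions or an interval silently produces a RangeDomain from the same constructor call. Roughly:

    NumericDomain(min=0, max=10)              # -> NumericDomain instance
    NumericDomain(min=0, max=10, interval=2)  # -> RangeDomain instance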


@ -82,12 +82,8 @@ class _MVEL(object):
        path = split_field(fromPath)

        # ADD LOCAL VARIABLES
        from pyLibrary.queries.es09.util import INDEX_CACHE

        columns = INDEX_CACHE[path[0]].columns
        for i, c in enumerate(columns):
            if c.name == "attachments":
                Log.debug("")
            if c.name.find("\\.") >= 0:
                self.prefixMap.insert(0, {
                    "path": c.name,


@ -10,7 +10,6 @@
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from copy import deepcopy
from datetime import datetime

from pyLibrary import convert

@ -21,11 +20,9 @@ from pyLibrary.debugs.logs import Log
from pyLibrary.maths import Math
from pyLibrary.queries import domains
from pyLibrary.dot.dicts import Dict
from pyLibrary.dot import split_field, join_field, coalesce
from pyLibrary.dot import split_field, join_field, coalesce, Null
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import wrap
from pyLibrary.queries import qb
from pyLibrary.queries.es09 import expressions
from pyLibrary.queries.es09.expressions import value2MVEL, isKeyword
from pyLibrary.queries.expressions import simplify_esfilter
from pyLibrary.times import durations

@ -34,21 +31,19 @@ from pyLibrary.times import durations

TrueFilter = {"match_all": {}}
DEBUG = False

INDEX_CACHE = {}  # MATCH NAMES TO ES URL AND COLUMNS eg {name: {"url": url, "columns": columns}}


def post(es, FromES, limit):
    if not FromES.facets and FromES.size == 0 and not FromES.aggs:
def post(es, es_query, limit):
    if not es_query.facets and es_query.size == 0 and not es_query.aggs:
        Log.error("FromES is sending no facets")
    # DO NOT KNOW WHY THIS WAS HERE
    # if isinstance(query.select, list) or len(query.edges) and not FromES.facets.keys and FromES.size == 0:
    #     Log.error("FromES is sending no facets")

    postResult = None
    post_result = None
    try:
        postResult = es.search(FromES)
        post_result = es.search(es_query)

        for facetName, f in postResult.facets.items():
        for facetName, f in post_result.facets.items():
            if f._type == "statistical":
                continue
            if not f.terms:

@ -59,7 +54,7 @@ def post(es, FromES, limit):
    except Exception, e:
        Log.error("Error with FromES", e)

    return postResult
    return post_result


def build_es_query(query):

@ -86,90 +81,7 @@ def build_es_query(query):
    return output


def parse_columns(parent_path, esProperties):
    """
    RETURN THE COLUMN DEFINITIONS IN THE GIVEN esProperties OBJECT
    """
    columns = DictList()
    for name, property in esProperties.items():
        if parent_path:
            path = join_field(split_field(parent_path) + [name])
        else:
            path = name

        if property.type == "nested" and property.properties:
            # NESTED TYPE IS A NEW TYPE DEFINITION
            # MARKUP CHILD COLUMNS WITH THE EXTRA DEPTH
            child_columns = deepcopy(parse_columns(path, property.properties))
            self_columns = deepcopy(child_columns)
            for c in self_columns:
                c.depth += 1
            columns.extend(self_columns)
            columns.append({
                "name": join_field(split_field(path)[1::]),
                "type": "nested",
                "useSource": False
            })

            if path not in INDEX_CACHE:
                pp = split_field(parent_path)
                for i in qb.reverse(range(len(pp))):
                    c = INDEX_CACHE.get(join_field(pp[:i + 1]), None)
                    if c:
                        INDEX_CACHE[path] = c.copy()
                        break
                else:
                    Log.error("Can not find parent")

                INDEX_CACHE[path].name = path
            INDEX_CACHE[path].columns = child_columns
            continue

        if property.properties:
            child_columns = parse_columns(path, property.properties)
            columns.extend(child_columns)
            columns.append({
                "name": join_field(split_field(path)[1::]),
                "type": "object",
                "useSource": False
            })

        if property.dynamic:
            continue
        if not property.type:
            continue
        if property.type == "multi_field":
            property.type = property.fields[name].type  # PULL DEFAULT TYPE
            for i, (n, p) in enumerate(property.fields.items()):
                if n == name:
                    # DEFAULT
                    columns.append({"name": join_field(split_field(path)[1::]), "type": p.type, "useSource": p.index == "no"})
                else:
                    columns.append({"name": join_field(split_field(path)[1::]) + "." + n, "type": p.type, "useSource": p.index == "no"})
            continue

        if property.type in ["string", "boolean", "integer", "date", "long", "double"]:
            columns.append({
                "name": join_field(split_field(path)[1::]),
                "type": property.type,
                "useSource": property.index == "no"
            })
            if property.index_name and name != property.index_name:
                columns.append({
                    "name": property.index_name,
                    "type": property.type,
                    "useSource": property.index == "no"
                })
        elif property.enabled == None or property.enabled == False:
            columns.append({
                "name": join_field(split_field(path)[1::]),
                "type": "object",
                "useSource": True
            })
        else:
            Log.warning("unknown type {{type}} for property {{path}}", type=property.type, path=path)

    return columns
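As a rough illustration of what parse_columns yields (a made-up mapping; the exact dict fields follow the code above):

    es_properties = wrap({
        "status": {"type": "string", "index": "not_analyzed"},
        "attachments": {"type": "nested", "properties": {"size": {"type": "long"}}}
    })
    # parse_columns("bugs", es_properties) would produce columns roughly like:
    #   {"name": "status", "type": "string", "useSource": False}
    #   {"name": "attachments.size", "type": "long", "useSource": False, "depth": 1}
    #   {"name": "attachments", "type": "nested", "useSource": False}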


def compileTime2Term(edge):

@ -200,7 +112,7 @@ def compileTime2Term(edge):
        if Math.round(value) == 0:
            return edge.domain.NULL

        d = datetime(str(value)[:4:], str(value).right(2), 1)
        d = datetime(int(str(value)[:4]), int(str(value)[-2:]), 1)  # datetime() REQUIRES ints, NOT STRINGS
        d = d.addMilli(offset)
        return edge.domain.getPartByKey(d)
    else:



@ -12,21 +12,22 @@ from __future__ import division
from __future__ import absolute_import
from collections import Mapping

from pyLibrary import convert
from pyLibrary.collections import MAX
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import listwrap, Dict, wrap, literal_field, set_default, coalesce, Null, split_field, join_field
from pyLibrary.maths import Math
from pyLibrary.queries import qb, es09
from pyLibrary.queries.dimensions import Dimension
from pyLibrary.queries.domains import PARTITION, SimpleSetDomain, is_keyword
from pyLibrary.queries.es14.util import aggregates1_4
from pyLibrary.queries.domains import PARTITION, SimpleSetDomain, is_keyword, DefaultDomain
from pyLibrary.queries.es14.util import aggregates1_4, NON_STATISTICAL_AGGS
from pyLibrary.queries.expressions import simplify_esfilter, qb_expression_to_ruby, get_all_vars
from pyLibrary.queries.query import DEFAULT_LIMIT
from pyLibrary.times.timer import Timer


def is_aggsop(es, query):
    es.cluster.get_metadata()
    if any(map(es.cluster.version.startswith, ["1.4.", "1.5.", "1.6."])) and (query.edges or query.groupby or any(a != None and a != "none" for a in listwrap(query.select).aggregate)):
    if any(map(es.cluster.version.startswith, ["1.4.", "1.5.", "1.6.", "1.7."])) and (query.edges or query.groupby or any(a != None and a != "none" for a in listwrap(query.select).aggregate)):
        return True
    return False


@ -40,24 +41,60 @@ def es_aggsop(es, frum, query):
    for s in select:
        if s.aggregate == "count" and (s.value == None or s.value == "."):
            s.pull = "doc_count"
        elif s.value == ".":
            if frum.typed:
                # STATISTICAL AGGS IMPLY $value, WHILE OTHERS CAN BE ANYTHING
                if s.aggregate in NON_STATISTICAL_AGGS:
                    #TODO: HANDLE BOTH $value AND $objects TO COUNT
                    Log.error("do not know how to handle")
                else:
                    s.value = "$value"
                    new_select["$value"] += [s]
            else:
                if s.aggregate in NON_STATISTICAL_AGGS:
                    #TODO: WE SHOULD BE ABLE TO COUNT, BUT WE MUST *OR* ALL LEAF VALUES TO DO IT
                    Log.error("do not know how to handle")
                else:
                    Log.error('Not expecting ES to have a value at "." to which {{agg}} can be applied', agg=s.aggregate)
        elif is_keyword(s.value):
            new_select[literal_field(s.value)] += [s]
        else:
            formula.append(s)

    for litral_field, many in new_select.items():
        if len(many)>1:
            canonical_name=literal_field(many[0].name)
            es_query.aggs[canonical_name].stats.field = many[0].value
    for canonical_name, many in new_select.items():
        representative = many[0]
        if representative.value == ".":
            Log.error("do not know how to handle")
        else:
            field_name = representative.value

        if len(many) > 1 or many[0].aggregate in ("median", "percentile"):
            # canonical_name=literal_field(many[0].name)
            for s in many:
                if s.aggregate == "count":
                    s.pull = canonical_name + ".count"
                    es_query.aggs[literal_field(canonical_name)].stats.field = field_name
                    s.pull = literal_field(canonical_name) + ".count"
                elif s.aggregate == "median":
                    #ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
                    key = literal_field(canonical_name + " percentile")

                    es_query.aggs[key].percentiles.field = field_name
                    es_query.aggs[key].percentiles.percents += [50]
                    s.pull = key + ".values.50\.0"
                elif s.aggregate == "percentile":
                    #ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
                    key = literal_field(canonical_name + " percentile")
                    percent = Math.round(s.percentile * 100, decimal=6)

                    es_query.aggs[key].percentiles.field = field_name
                    es_query.aggs[key].percentiles.percents += [percent]
                    s.pull = key + ".values." + literal_field(unicode(percent))
                else:
                    s.pull = canonical_name + "." + aggregates1_4[s.aggregate]
                    es_query.aggs[literal_field(canonical_name)].stats.field = field_name
                    s.pull = literal_field(canonical_name) + "." + aggregates1_4[s.aggregate]
        else:
            s = many[0]
            s.pull = literal_field(s.value) + ".value"
            es_query.aggs[literal_field(s.value)][aggregates1_4[s.aggregate]].field = s.value
            es_query.aggs[literal_field(canonical_name)][aggregates1_4[representative.aggregate]].field = field_name
            representative.pull = literal_field(canonical_name) + ".value"
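For orientation, the aggs this loop builds when two metrics and a median share one field (hand-written and abridged; the field name is invented):

    es_aggs = {
        "diff": {"stats": {"field": "diff"}},  # one stats agg serves the .avg / .max / .count pulls
        "diff percentile": {"percentiles": {"field": "diff", "percents": [50]}},  # median pulls .values.50\.0
    }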

    for i, s in enumerate(formula):
        new_select[unicode(i)] = s

@ -71,6 +108,8 @@ def es_aggsop(es, frum, query):
        start += d.num_columns

    if query.where:
        #TODO: INCLUDE FILTERS ON EDGES

        filter = simplify_esfilter(query.where)
        es_query = Dict(
            aggs={"_filter": set_default({"filter": filter}, es_query)}

@ -79,13 +118,18 @@ def es_aggsop(es, frum, query):
    if len(split_field(frum.name)) > 1:
        es_query = wrap({
            "size": 0,
            "aggs": {"_nested": set_default({
            "aggs": {"_nested": set_default(
                {
                    "nested": {
                        "path": join_field(split_field(frum.name)[1::])
                        "path": frum.query_path
                    }
                }, es_query)}
                },
                es_query
            )}
        })

    es_query.size = 0

    with Timer("ES query time") as es_duration:
        result = es09.util.post(es, es_query, query.limit)

@ -109,10 +153,35 @@ def es_aggsop(es, frum, query):


class AggsDecoder(object):
    def __new__(cls, *args, **kwargs):
        e = args[0]
    def __new__(cls, e=None, query=None, *args, **kwargs):
        if query.groupby:
            # GROUPBY ASSUMES WE IGNORE THE DOMAIN RANGE
            e.allowNulls = False
        else:
            e.allowNulls = coalesce(e.allowNulls, True)

        if e.value and e.domain.type == "default":
            if query.groupby:
                return object.__new__(DefaultDecoder, e.copy())

            if is_keyword(e.value):
                cols = query.frum.get_columns()
                col = cols.filter(lambda c: c.name == e.value)[0]
                if not col:
                    return object.__new__(DefaultDecoder, e.copy())
                limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)

                if col.partitions != None:
                    e.domain = SimpleSetDomain(partitions=col.partitions[:limit:])
                else:
                    e.domain = set_default(DefaultDomain(limit=limit), e.domain.as_dict())
                    return object.__new__(DefaultDecoder, e.copy())

            elif isinstance(e.value, (list, Mapping)):
                Log.error("Not supported yet")
            else:
                return object.__new__(DefaultDecoder, e.copy())

        if e.value and e.domain.type in PARTITION:
            return object.__new__(SetDecoder, e)
        if isinstance(e.domain.dimension, Dimension):

@ -167,9 +236,32 @@ class AggsDecoder(object):

class SetDecoder(AggsDecoder):
    def append_query(self, es_query, start):
        self.start = start
        domain = self.edge.domain

        include = [p[domain.key] for p in domain.partitions]
        if self.edge.allowNulls:

            return wrap({"aggs": {
                "_match": set_default({"terms": {"field": self.edge.value}}, es_query),
                "_missing": set_default({"missing": {"field": self.edge.value}}, es_query),
                "_match": set_default({"terms": {
                    "field": self.edge.value,
                    "size": 0,
                    "include": include
                }}, es_query),
                "_missing": set_default(
                    {"filter": {"or": [
                        {"missing": {"field": self.edge.value}},
                        {"not": {"terms": {self.edge.value: include}}}
                    ]}},
                    es_query
                ),
            }})
        else:
            return wrap({"aggs": {
                "_match": set_default({"terms": {
                    "field": self.edge.value,
                    "size": 0,
                    "include": include
                }}, es_query)
            }})
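The edge above therefore compiles to one terms bucket plus one catch-all bucket for everything outside the known partitions; schematically (field name and values invented):

    edge_aggs = {"aggs": {
        "_match": {"terms": {"field": "product", "size": 0, "include": ["Firefox", "Core"]}},
        "_missing": {"filter": {"or": [
            {"missing": {"field": "product"}},
            {"not": {"terms": {"product": ["Firefox", "Core"]}}}
        ]}}
    }}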

    def get_value(self, index):

@ -216,7 +308,7 @@ def _range_composer(edge, domain, es_query, to_float):
    missing_filter = set_default(
        {"filter": {"or": [
            missing_range,
            {"missing": {"field": get_all_vars(edge.value)}}
            {"or": [{"missing": {"field": v}} for v in get_all_vars(edge.value)]}
        ]}},
        es_query
    )

@ -332,7 +424,7 @@ class DefaultDecoder(SetDecoder):
    def __init__(self, edge, query):
        AggsDecoder.__init__(self, edge, query)
        self.edge = self.edge.copy()
        self.edge.allowNulls = False  # SINCE WE DO NOT KNOW THE DOMAIN, WE HAVE NO SENSE OF WHAT IS OUTSIDE THAT DOMAIN, allowNulls==True MAKES NO SENSE
        # self.edge.allowNulls = False  # SINCE WE DO NOT KNOW THE DOMAIN, WE HAVE NO SENSE OF WHAT IS OUTSIDE THAT DOMAIN, allowNulls==True MAKES NO SENSE
        self.edge.domain.partitions = set()
        self.edge.domain.limit = coalesce(self.edge.domain.limit, query.limit, 10)


@ -0,0 +1,204 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from pyLibrary import queries
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import split_field, DictList, listwrap, literal_field, wrap, coalesce, Dict
from pyLibrary.queries import es09
from pyLibrary.queries.domains import is_keyword
from pyLibrary.queries.es14.setop import format_dispatch
from pyLibrary.queries.es14.util import qb_sort_to_es_sort

from pyLibrary.queries.expressions import query_get_all_vars, qb_expression_to_ruby, expression_map, qb_expression_to_esfilter
from pyLibrary.queries.unique_index import UniqueIndex
from pyLibrary.thread.threads import Thread
from pyLibrary.times.timer import Timer


def is_deepop(es, query):
    if query.edges or query.groupby:
        return False
    vars = query_get_all_vars(query)
    columns = query.frum.get_columns()
    if len(split_field(query.frum.name)) > 1:
        return True
    if any(c for c in columns if c.nested_path and c.name in vars):
        return True
    return False


def es_deepop(es, query):
    columns = query.frum.get_columns()
    query_path = query.frum.query_path
    columns = UniqueIndex(keys=["name"], data=sorted(columns, lambda a, b: cmp(len(b.nested_path), len(a.nested_path))), fail_on_dup=False)
    _map = {c.name: c.abs_name for c in columns}
    where = qb_expression_to_esfilter(expression_map(query.where, _map))
    more_filter = {
        "and": [
            where,
            {"not": {
                "nested": {
                    "path": query_path,
                    "filter": {
                        "match_all": {}
                    }
                }
            }}
        ]
    }
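That second filter is worth a note: the not-nested-match_all clause matches only parent documents that have no nested rows at all, so the main nested query and this follow-up together cover each matching parent exactly once. In outline:

    # query 1: nested query with inner_hits     -> parents that HAVE matching nested rows
    # query 2: where AND not(nested match_all)  -> parents with NO nested rows at all
    # the union of the two result sets yields every matching parent exactly once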


    es_query = wrap({
        "query": {
            "nested": {
                "path": query_path,
                "inner_hits": {},
                "filter": where
            },
        },
        "fields": []
    })
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = qb_sort_to_es_sort(query.sort)

    is_list = isinstance(query.select, list)
    new_select = DictList()

    def get_pull(column):
        if column.nested_path:
            return "_inner" + column.abs_name[len(column.nested_path[0]):]
        else:
            return "fields." + literal_field(column.abs_name)

    i = 0
    for s in listwrap(query.select):
        if s.value == "*":
            # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
            for c in columns:
                if c.relative and c.type not in ["nested", "object"]:
                    if not c.nested_path:
                        es_query.fields.append(c.abs_name)
                    new_select.append({
                        "name": c.name,
                        "pull": get_pull(c),
                        "nested_path": c.nested_path[0],
                        "put": {"name": c.name, "index": i, "child": "."}
                    })
                    i += 1
            # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS

            col_names = [c.name for c in columns if c.relative]
            for n in new_select:
                if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
                    n.name = n.put.name = n.name.lstrip(".")
        elif s.value == ".":
            for c in columns:
                if c.relative and c.type not in ["nested", "object"]:
                    if not c.nested_path:
                        es_query.fields.append(c.abs_name)
                    new_select.append({
                        "name": c.name,
                        "pull": get_pull(c),
                        "nested_path": c.nested_path[0],
                        "put": {"name": ".", "index": i, "child": c.abs_name}
                    })
                    i += 1
        elif isinstance(s.value, basestring) and s.value.endswith(".*") and is_keyword(s.value[:-2]):
            parent = s.value[:-1]
            prefix = len(parent)
            for c in columns:
                if c.name.startswith(parent):
                    pull = get_pull(c)
                    if not c.nested_path:
                        es_query.fields.append(c.abs_name)
                    new_select.append({
                        "name": s.name + "." + c.name[prefix:],
                        "pull": pull,
                        "nested_path": c.nested_path[0],
                        "put": {"name": s.name + "." + c.name[prefix:], "index": i, "child": "."}
                    })
        elif isinstance(s.value, basestring) and is_keyword(s.value):
            parent = s.value + "."
            prefix = len(parent)
            net_columns = [c for c in columns if c.name.startswith(parent)]
            if not net_columns:
                c = columns[(s.value,)]
                pull = get_pull(c)
                if not c.nested_path:
                    es_query.fields.append(s.value)
                new_select.append({
                    "name": s.name if is_list else ".",
                    "pull": pull,
                    "nested_path": c.nested_path[0],
                    "put": {"name": s.name, "index": i, "child": "."}
                })
            else:
                for n in net_columns:
                    pull = get_pull(n)
                    if not n.nested_path:
                        es_query.fields.append(n.abs_name)
                    new_select.append({
                        "name": s.name if is_list else ".",
                        "pull": pull,
                        "nested_path": n.nested_path[0],
                        "put": {"name": s.name, "index": i, "child": n[prefix:]}
                    })
            i += 1
        elif isinstance(s.value, list):
            Log.error("need an example")
            es_query.fields.extend([v for v in s.value])
        else:
            Log.error("need an example")
            es_query.script_fields[literal_field(s.name)] = {"script": qb_expression_to_ruby(s.value)}
            new_select.append({
                "name": s.name if is_list else ".",
                "value": s.name,
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    more = []
    def get_more(please_stop):
        more.append(es09.util.post(
            es,
            Dict(
                query={"filtered": {"filter": more_filter}},
                fields=es_query.fields
            ),
            query.limit
        ))
    need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    # RETURN A LIST OF INNER OBJECTS
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[query_path].hits.hits:
                t._inner = i._source
                yield t
        Thread.join(need_more)
        for t in more[0].hits.hits:
            yield t

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.es_response_time = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)


@ -15,6 +15,7 @@ from pyLibrary import convert
from pyLibrary.collections.matrix import Matrix
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import Dict, set_default, coalesce, wrap
from pyLibrary.maths import Math
from pyLibrary.queries.containers.cube import Cube
from pyLibrary.queries.es14.aggs import count_dim, aggs_iterator, format_dispatch

@ -27,7 +28,7 @@ def format_cube(decoders, aggs, start, query, select):
        coord = tuple(d.get_index(row) for d in decoders)
        for s, m in matricies:
            try:
                if m[coord]:
                if m[coord]:  # THIS CAN HAPPEN WHEN THE SET QUERIED IS SMALLER THAN WHAT IS AVAILABLE IN ES
                    Log.error("Not expected")
                m[coord] = agg[s.pull]
            except Exception, e:

@ -115,6 +116,8 @@ def format_table_from_aggop(decoders, aggs, start, query, select):

    row = []
    for s in select:
        if not s.pull:
            Log.error("programmer error")
        row.append(agg[s.pull])

    return Dict(

@ -196,14 +199,23 @@ def format_list_from_aggop(decoders, aggs, start, query, select):
    agg = b
    b = coalesce(agg._filter, agg._nested)

    if isinstance(query.select, list):
        item = Dict()
        for s in select:
            item[s.name] = agg[s.pull]
    else:
        item = agg[select[0].pull]

    if query.edges or query.groupby:
        return wrap({
            "meta": {"format": "list"},
            "data": [item]
        })
    else:
        return wrap({
            "meta": {"format": "value"},
            "data": item
        })



@ -10,108 +10,30 @@
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import

from collections import Mapping

from pyLibrary import queries
from pyLibrary.collections.matrix import Matrix
from pyLibrary.collections import AND, UNION
from pyLibrary.dot import coalesce, split_field, set_default, Dict, unwraplist, literal_field
from pyLibrary.collections import AND
from pyLibrary.dot import coalesce, split_field, set_default, Dict, unwraplist, literal_field, join_field, unwrap
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import listwrap
from pyLibrary.queries.domains import is_keyword
from pyLibrary.queries import domains
from pyLibrary.queries.expressions import qb_expression_to_esfilter, simplify_esfilter, qb_expression_to_ruby
from pyLibrary.maths import Math
from pyLibrary.debugs.logs import Log
from pyLibrary.queries import domains, es14, es09, qb
from pyLibrary.queries.containers.cube import Cube
from pyLibrary.queries.domains import is_keyword
from pyLibrary.queries.es14.util import qb_sort_to_es_sort
from pyLibrary.queries.expressions import qb_expression_to_esfilter, simplify_esfilter, qb_expression_to_ruby
from pyLibrary.queries.query import DEFAULT_LIMIT
from pyLibrary.times.timer import Timer
from pyLibrary.queries import es14, es09


format_dispatch = {}

def is_fieldop(es, query):
    if not any(map(es.cluster.version.startswith, ["1.4.", "1.5.", "1.6."])):
        return False

    # THESE SMOOTH EDGES REQUIRE ALL DATA (SETOP)
    select = listwrap(query.select)
    if not query.edges:
        isDeep = len(split_field(query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
        isSimple = AND(s.value != None and (s.value in ["*", "."] or is_keyword(s.value)) for s in select)
        noAgg = AND(s.aggregate == "none" for s in select)

        if not isDeep and isSimple and noAgg:
            return True
    else:
        isSmooth = AND((e.domain.type in domains.ALGEBRAIC and e.domain.interval == "none") for e in query.edges)
        if isSmooth:
            return True

    return False


def es_fieldop(es, query):
    es_query, es_filter = es14.util.es_query_template(query.frum.name)
    es_query[es_filter] = simplify_esfilter(qb_expression_to_esfilter(query.where))
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = qb_sort_to_es_sort(query.sort)
    es_query.fields = DictList()

    source = "fields"

    select = listwrap(query.select)
    for s in select.value:
        if s == "*":
            es_query.fields = None
            source = "_source"
        elif s == ".":
            es_query.fields = None
            source = "_source"
        elif isinstance(s, basestring) and is_keyword(s):
            es_query.fields.append(s)
        elif isinstance(s, list) and es_query.fields is not None:
            es_query.fields.extend(s)
        elif isinstance(s, Mapping) and es_query.fields is not None:
            es_query.fields.extend(s.values())
        elif es_query.fields is not None:
            es_query.fields.append(s)
    es_query.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]

    return extract_rows(es, es_query, source, select, query)


def extract_rows(es, es_query, source, select, query):
    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits
    for i, s in enumerate(select.copy()):
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if s.value == "*":
            try:
                column_names = set(c.name for c in query.frum.get_columns() if (c.type not in ["object"] or c.useSource) and not c.depth)
            except Exception, e:
                Log.warning("can not get columns", e)
                column_names = UNION(*[[k for k, v in row.items()] for row in T.select(source)])
            column_names -= set(select.name)
            select = select[:i:] + [{"name": n, "value": n} for n in column_names] + select[i + 1::]
            break

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, select, source)
        output.meta.es_response_time = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)


def is_setop(es, query):
    if not any(map(es.cluster.version.startswith, ["1.4.", "1.5.", "1.6."])):
    if not any(map(es.cluster.version.startswith, ["1.4.", "1.5.", "1.6.", "1.7."])):
        return False

    select = listwrap(query.select)

@ -133,70 +55,140 @@ def is_setop(es, query):

def es_setop(es, query):
    es_query, es_filter = es14.util.es_query_template(query.frum.name)
    es_query[es_filter]=simplify_esfilter(qb_expression_to_esfilter(query.where))
    es_query[es_filter] = simplify_esfilter(qb_expression_to_esfilter(query.where))
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.fields = DictList()
    es_query.sort = qb_sort_to_es_sort(query.sort)
    es_query.fields = DictList()

    return extract_rows(es, es_query, query)


def extract_rows(es, es_query, query):
    is_list = isinstance(query.select, list)
    new_select = DictList()
    column_names = set(c.name for c in query.frum.get_columns() if (c.type not in ["object"]) and not c.nested_path)
    source = "fields"
    select = listwrap(query.select)
    for s in select:

    i = 0
    for s in listwrap(query.select):
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if s.value == "*":
            es_query.fields = None
            es_query.script_fields = None
            source = "_source"

            net_columns = column_names - set(listwrap(query.select).name)
            for n in net_columns:
                new_select.append({"name": n, "value": n, "put": {"name": n, "index": i, "child": "."}})
                i += 1
        elif s.value == ".":
            es_query.fields = None
            es_query.script_fields = None
            source = "_source"

            new_select.append({"name": s.name if is_list else ".", "value": s.value, "put": {"name": s.name, "index": i, "child": "."}})
            i += 1
        elif isinstance(s.value, basestring) and s.value.endswith(".*") and is_keyword(s.value[:-2]):
            parent = s.value[:-1]
            prefix = len(parent)
            for c in column_names:
                if c.startswith(parent):
                    if es_query.fields is not None:
                        es_query.fields.append(c)

                    new_select.append({"name": s.name+"."+c[prefix:], "value": c, "put": {"name": s.name+"."+c[prefix:], "index": i, "child": "."}})
                    i += 1
        elif isinstance(s.value, basestring) and is_keyword(s.value):
            parent = s.value + "."
            prefix = len(parent)
            net_columns = [c for c in column_names if c.startswith(parent)]
            if not net_columns:
                if es_query.fields is not None:
                    es_query.fields.append(s.value)
        elif isinstance(s.value, list) and es_query.fields is not None:
            es_query.fields.extend(s.value)
                new_select.append({"name": s.name if is_list else ".", "value": s.value, "put": {"name": s.name, "index": i, "child": "."}})
            else:
                for n in net_columns:
                    if es_query.fields is not None:
                        es_query.fields.append(n)
                    new_select.append({"name": s.name if is_list else ".", "value": n, "put": {"name": s.name, "index": i, "child": n[prefix:]}})
            i += 1
        elif isinstance(s.value, list):
            Log.error("need an example")
            if es_query.fields is not None:
                es_query.fields.extend([v for v in s.value])
        else:
            es_query.script_fields[literal_field(s.name)] = {"script": qb_expression_to_ruby(s.value)}
            new_select.append({
                "name": s.name if is_list else ".",
                "pull": "fields." + literal_field(s.name),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    return extract_rows(es, es_query, source, select, query)
    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = join_field(["_source"] + split_field(n.value))
        else:
            n.pull = "fields." + literal_field(n.value)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.es_response_time = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)


def format_list(T, select, source):

def format_list(T, select, query=None):
    data = []
    for row in T:
        r = Dict(_id=row._id)
        r = Dict()
        for s in select:
            if s.value == ".":
                r[s.name] = row[source]
            else:
                if source=="_source":
                    r[s.name] = unwraplist(row[source][s.value])
                elif isinstance(s.value, basestring):  # fields
                    r[s.name] = unwraplist(row[source][literal_field(s.value)])
                else:
                    r[s.name] = unwraplist(row[source][literal_field(s.name)])
        data.append(r)
            r[s.name][s.put.child] = unwraplist(row[s.pull])
        data.append(r if r else None)
    return Dict(
        meta={"format": "list"},
        data=data
    )
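The "put" descriptor is the contract between row extraction and the formatters: "pull" says where a value comes from in the ES hit, "name"/"index" say which output column it lands in, and "child" is the sub-path inside that column. A hand-made example (field names invented):

    s = wrap({"name": "bug", "pull": "fields.bug_id", "put": {"name": "bug", "index": 0, "child": "."}})
    hit = wrap({"fields": {"bug_id": 1234}})
    # format_list sets r["bug"][.] for this hit; with child == "." that is simply r["bug"] = 1234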


def format_table(T, select, source):
    header = [s.name for s in select]
    map = {s.name: i for i, s in enumerate(select)}  # MAP FROM name TO COLUMN INDEX
def format_table(T, select, query=None):
    data = []
    num_columns = (Math.MAX(select.put.index) + 1)
    for row in T:
        r = [None] * len(header)
        r = [None] * num_columns
        for s in select:
            if s.value == ".":
                r[map[s.name]] = row[source]
            value = unwraplist(row[s.pull])

            if value == None:
                continue

            index, child = s.put.index, s.put.child
            if child == ".":
                r[index] = value
            else:
                if source == "_source":
                    r[map[s.name]] = unwraplist(row[source][s.value])
                elif isinstance(s.value, basestring):  # fields
                    r[map[s.name]] = unwraplist(row[source][literal_field(s.value)])
                else:
                    r[map[s.name]] = unwraplist(row[source][literal_field(s.name)])
                if r[index] is None:
                    r[index] = Dict()
                r[index][child] = value

        data.append(r)

    header = [None] * num_columns
    for s in select:
        if header[s.put.index]:
            continue
        header[s.put.index] = s.put.name

    return Dict(
        meta={"format": "table"},
        header=header,

@ -204,26 +196,22 @@ def format_table(T, select, source):
    )


def format_cube(T, select, source):
    matricies = {}
    for s in select:
        try:
            if s.value == ".":
                matricies[s.name] = Matrix.wrap(T.select(source))
            elif isinstance(s.value, list):
                matricies[s.name] = Matrix.wrap([tuple(unwraplist(t[source][ss]) for ss in s.value) for t in T])
            else:
                if source == "_source":
                    matricies[s.name] = Matrix.wrap([unwraplist(t[source][s.value]) for t in T])
def format_cube(T, select, query=None):
    table = format_table(T, select, query)

                elif isinstance(s.value, basestring):  # fields
                    matricies[s.name] = Matrix.wrap([unwraplist(t[source].get(s.value)) for t in T])
                else:
                    matricies[s.name] = Matrix.wrap([unwraplist(t[source].get(s.name)) for t in T])
        except Exception, e:
            Log.error("", e)
    cube = Cube(select, edges=[{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": len(T), "interval": 1}}], data=matricies)
    return cube
    if len(table.data) == 0:
        return Cube(
            select,
            edges=[{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": 0, "interval": 1}}],
            data={h: Matrix(list=[]) for i, h in enumerate(table.header)}
        )

    cols = zip(*unwrap(table.data))
    return Cube(
        select,
        edges=[{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": len(table.data), "interval": 1}}],
        data={h: Matrix(list=cols[i]) for i, h in enumerate(table.header)}
    )
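The cube format is just the table format transposed: rows become per-column vectors along a synthetic "rownum" edge. In plain Python terms:

    table_data = [[1, "a"], [2, "b"], [3, "c"]]   # rows as produced by format_table
    cols = zip(*table_data)                       # [(1, 2, 3), ('a', 'b', 'c')]
    # each tuple becomes one Matrix, keyed by its header name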


set_default(format_dispatch, {


@ -11,10 +11,15 @@ from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import

from pyLibrary.dot import wrap, join_field, split_field
from pyLibrary.dot import wrap, split_field, join_field


def es_query_template(path):
    """
    RETURN TEMPLATE AND PATH-TO-FILTER AS A 2-TUPLE
    :param path: DOT-DELIMITED PATH INTO THE DOCUMENT (FIRST STEP IS THE TABLE NAME)
    :return: (es_query, filter_path) PAIR
    """
    sub_path = split_field(path)[1:]

    if sub_path:

@ -34,24 +39,30 @@ def es_query_template(path):
    else:
        output = wrap({
            "query": {
                "filter": {},
                "filtered": {
                    "query": {"match_all": {}},
                    "filter": {}
                }
            },
            "from": 0,
            "size": 0,
            "sort": []
        })
        return output, "query.filter"
        return output, "query.filtered.filter"
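A quick illustration of how callers use the template pair (abridged from es_setop above; the filter itself is invented):

    es_query, es_filter = es_query_template("bugs")         # no sub-path -> plain filtered query
    es_query[es_filter] = {"term": {"product": "Firefox"}}  # filter written through the returned path
    # es_query.query.filtered.filter is now the term filter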


def qb_sort_to_es_sort(sort):
    if not sort:
        return []

    output = []
    for s in sort:
        if s.sort == 1:
            output.append(s.field)
            output.append(s.value)
        elif s.sort == -1:
            output.append({s.field: "desc"})
            output.append({s.value: "desc"})
        else:
            pass
    return output

@ -71,6 +82,8 @@ aggregates1_4 = {
    "mean": "avg",
    "average": "avg",
    "avg": "avg",
    "median": "median",
    "percentile": "percentile",
    "N": "count",
    "X0": "count",
    "X1": "sum",

@ -81,3 +94,5 @@ aggregates1_4 = {
    "variance": "variance"
}

NON_STATISTICAL_AGGS = {"none", "one", "count"}


@ -15,7 +15,7 @@ import itertools

from pyLibrary import convert
from pyLibrary.collections import OR
from pyLibrary.dot import coalesce, wrap, set_default, literal_field
from pyLibrary.dot import coalesce, wrap, set_default, literal_field, listwrap
from pyLibrary.debugs.logs import Log
from pyLibrary.maths import Math
from pyLibrary.queries.domains import is_keyword

@ -25,6 +25,16 @@ from pyLibrary.times.dates import Date

TRUE_FILTER = True
FALSE_FILTER = False

_Query = None

def _late_import():
    global _Query

    from pyLibrary.queries.query import Query as _Query

    _ = _Query


def compile_expression(source):

@ -53,7 +63,7 @@ def qb_expression(expr):


def qb_expression_to_function(expr):
    if expr!=None and not isinstance(expr, (Mapping, list)) and hasattr(expr, "__call__"):
    if expr != None and not isinstance(expr, (Mapping, list)) and hasattr(expr, "__call__"):
        return expr
    return compile_expression(qb_expression_to_python(expr))

@ -89,7 +99,10 @@ def qb_expression_to_ruby(expr):
    elif expr is False:
        return "false"

    try:
        op, term = expr.items()[0]
    except Exception, e:
        Log.error("expecting expression (`{op: term}` format)")

    mop = ruby_multi_operators.get(op)
    if mop:

@ -115,20 +128,15 @@ def qb_expression_to_ruby(expr):
        elif isinstance(term, Mapping):
            if op == "eq":
                # eq CAN ACCEPT A WHOLE OBJECT OF key:value PAIRS TO COMPARE
                output = " and ".join("(" + qb_expression_to_ruby(a) + ")" + bop + "(" + qb_expression_to_ruby(b) + ")" for a, b in term.items())
                output = " and ".join("(" + qb_expression_to_ruby(var) + bop + convert.value2quote(val) + ")" for var, val in term.items())
                return output
            else:
                a, b = term.items()[0]
                output = "(" + qb_expression_to_ruby(a) + ")" + bop + "(" + qb_expression_to_ruby(b) + ")"
                var, val = term.items()[0]
                output = "(" + qb_expression_to_ruby(var) + bop + convert.value2quote(val) + ")"
                return output
        else:
            Log.error("Expecting binary term")

    uop = ruby_unary_operators.get(op)
    if uop:
        output = expand_template(uop, {"term": qb_expression_to_ruby(term)})
        return output

    cop = complex_operators.get(op)
    if cop:
        output = cop(term).to_ruby()

@ -144,7 +152,10 @@ def qb_expression_to_python(expr):
        return unicode(expr)
    elif isinstance(expr, Date):
        return unicode(expr.unix)
    elif isinstance(expr, unicode):
    elif isinstance(expr, basestring):
        if isinstance(expr, str):
            expr = convert.utf82unicode(expr)

        if expr == ".":
            return "row"
        elif is_keyword(expr):

@ -165,6 +176,8 @@ def qb_expression_to_python(expr):
        if isinstance(term, list):
            if not term:
                return mop[1]  # RETURN DEFAULT
            elif len(term) == 1:
                return qb_expression_to_python(term[0])
            else:
                output = mop[0].join(["(" + qb_expression_to_python(t) + ")" for t in term])
                return output
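To make the translation concrete, an expression and (approximately) the Python source it compiles to, per the rules above:

    qb_expression_to_python({"and": [{"gt": {"a": 5}}, {"eq": {"b": "x"}}]})
    # -> '((row["a"]) > 5) and ((row["b"]) == "x")'  (approximate parenthesization)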

@ -183,26 +196,32 @@ def qb_expression_to_python(expr):
        elif isinstance(term, Mapping):
            if op == "eq":
                # eq CAN ACCEPT A WHOLE OBJECT OF key:value PAIRS TO COMPARE
                output = " and ".join("(" + qb_expression_to_python(a) + ")" + bop + "(" + qb_expression_to_python(b) + ")" for a, b in term.items())
                output = " and ".join("(" + qb_expression_to_python(a) + ")" + bop + convert.value2json(b) for a, b in term.items())
                return output
            else:
                a, b = term.items()[0]
                output = "(" + qb_expression_to_python(a) + ")" + bop + "(" + qb_expression_to_python(b) + ")"
                output = "(" + qb_expression_to_python(a) + ")" + bop + convert.value2json(b)
                return output
        else:
            Log.error("Expecting binary term")

    uop = python_unary_operators.get(op)
    if uop:
        output = uop + "(" + qb_expression_to_python(term) + ")"
    cop = complex_operators.get(op)
    if cop:
        output = cop(op, term).to_python()
        return output

    Log.error("`{{op}}` is not a recognized operation", op=op)


def get_all_vars(expr):
    if not _Query:
        _late_import()

    if expr == None:
        return set()
    elif isinstance(expr, _Query):
        return query_get_all_vars(expr)
    elif isinstance(expr, unicode):
        if expr == "." or is_keyword(expr):
            return set([expr])

@ -249,10 +268,6 @@ def get_all_vars(expr):
        else:
            Log.error("Expecting binary term")

    uop = ruby_unary_operators.get(op)
    if uop:
        return get_all_vars(term)

    cop = complex_operators.get(op)
    if cop:
        return cop(op, term).vars()

@ -260,10 +275,134 @@ def get_all_vars(expr):
    Log.error("`{{op}}` is not a recognized operation", op=op)


def expression_map(expr, map):
    """
    USE map TO TRANSLATE VARIABLE NAMES TO OTHER NAMES
    """
    if expr == None:
        return expr
    elif Math.is_number(expr):
        return expr
    elif isinstance(expr, Date):
        return expr
    elif isinstance(expr, unicode):
        if expr == ".":
            return expr
        elif is_keyword(expr):
            return map.get(expr, expr)
        else:
            Log.error("Expecting a json path")
    elif isinstance(expr, CODE):
        return expr.code
    elif expr is True:
        return expr
    elif expr is False:
        return expr

    op, term = expr.items()[0]

    mop = python_multi_operators.get(op)
    if mop:
        output = {op: [expression_map(t, map) for t in term]}  # THE map PARAMETER SHADOWS THE BUILTIN; MAP EACH OPERAND EXPLICITLY
        return output

    bop = python_binary_operators.get(op)
    if bop:
        if isinstance(term, list):
            output = {op: [expression_map(t, map) for t in term]}
            return output
        elif isinstance(term, Mapping):
            output = {op: {expression_map(k, map): v for k, v in term.items()}}
            return output
        else:
            Log.error("Expecting binary term")

    Log.error("`{{op}}` is not a recognized operation", op=op)


def query_get_all_vars(query, exclude_where=False):
    """
    :param query:
    :param exclude_where: Sometimes we do not want to look at the where clause
    :return: all variables in use by query
    """
    output = set()
    for s in listwrap(query.select):
        output |= select_get_all_vars(s)
    for s in listwrap(query.edges):
        output |= edges_get_all_vars(s)
    for s in listwrap(query.groupby):
        output |= edges_get_all_vars(s)
    if not exclude_where:
        output |= get_all_vars(query.where)
    return output
|
||||
|
||||
|
||||
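A sketch over an already-normalized query, assuming select and edges carry explicit value fields; the query is hypothetical:

    q = wrap({
        "select": {"name": "a", "value": "a"},
        "edges": [{"value": "b", "domain": {}}],
        "where": {"exists": "c"}
    })
    query_get_all_vars(q)                      # expected: {"a", "b", "c"}
    query_get_all_vars(q, exclude_where=True)  # expected: {"a", "b"}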
def select_get_all_vars(s):
|
||||
if isinstance(s.value, list):
|
||||
return set(s.value)
|
||||
elif isinstance(s.value, basestring):
|
||||
return set([s.value])
|
||||
elif s.value == None or s.value == ".":
|
||||
return set()
|
||||
else:
|
||||
if s.value == "*":
|
||||
return set(["*"])
|
||||
return get_all_vars(s.value)
|
||||
|
||||
|
||||
def edges_get_all_vars(e):
|
||||
output = set()
|
||||
if isinstance(e.value, basestring):
|
||||
output.add(e.value)
|
||||
if e.domain.key:
|
||||
output.add(e.domain.key)
|
||||
if e.domain.where:
|
||||
output |= get_all_vars(e.domain.where)
|
||||
if e.domain.partitions:
|
||||
for p in e.domain.partitions:
|
||||
if p.where:
|
||||
output |= get_all_vars(p.where)
|
||||
return output
|
||||
|
||||
|
||||
def where_get_all_vars(w):
|
||||
if w in [True, False, None]:
|
||||
return set()
|
||||
|
||||
output = set()
|
||||
key = list(w.keys())[0]
|
||||
val = w[key]
|
||||
if key in ["and", "or"]:
|
||||
for ww in val:
|
||||
output |= get_all_vars(ww)
|
||||
return output
|
||||
|
||||
if key == "not":
|
||||
return get_all_vars(val)
|
||||
|
||||
if key in ["exists", "missing"]:
|
||||
if isinstance(val, unicode):
|
||||
return {val}
|
||||
else:
|
||||
return {val.field}
|
||||
|
||||
if key in ["gte", "gt", "eq", "ne", "term", "terms", "lt", "lte", "range", "prefix"]:
|
||||
if not isinstance(val, Mapping):
|
||||
Log.error("Expecting `{{key}}` to have a dict value, not a {{type}}",
|
||||
key=key,
|
||||
type=val.__class__.__name__)
|
||||
return set(val.keys())
|
||||
|
||||
if key == "match_all":
|
||||
return set()
|
||||
|
||||
Log.error("do not know how to handle where {{where|json}}", {"where", w})
|
||||
|
||||
|
||||
|
||||
python_unary_operators = {
|
||||
"not": "not {{term}}",
|
||||
}
|
||||
|
||||
python_binary_operators = {
|
||||
"sub": " - ",
|
||||
|
@@ -282,6 +421,23 @@ python_binary_operators = {
|
|||
"term": " == "
|
||||
}
|
||||
|
||||
ruby_binary_operators = {
|
||||
"sub": " - ",
|
||||
"subtract": " - ",
|
||||
"minus": " - ",
|
||||
"div": " / ",
|
||||
"divide": " / ",
|
||||
"exp": " ** ",
|
||||
"mod": " % ",
|
||||
"gt": " > ",
|
||||
"gte": " >= ",
|
||||
"eq": " == ",
|
||||
"lte": " <= ",
|
||||
"lt": " < ",
|
||||
"ne": " != ",
|
||||
"term": " == "
|
||||
}
|
||||
|
||||
python_multi_operators = {
|
||||
"add": (" + ", "0"), # (operator, zero-array default value) PAIR
|
||||
"sum": (" + ", "0"),
|
||||
|
@@ -292,27 +448,6 @@ python_multi_operators = {
|
|||
"or": (" or ", "false")
|
||||
}
|
||||
|
||||
ruby_unary_operators = {
|
||||
"not": "! {{term}}",
|
||||
}
|
||||
|
||||
ruby_binary_operators = {
|
||||
"sub": " - ",
|
||||
"subtract": " - ",
|
||||
"minus": " - ",
|
||||
"div": " / ",
|
||||
"divide": " / ",
|
||||
"exp": " ** ",
|
||||
"mod": " % ",
|
||||
"gt": " > ",
|
||||
"gte": " >= ",
|
||||
"eq": " == ",
|
||||
"lte": " <= ",
|
||||
"lt": " < ",
|
||||
"ne": " != ",
|
||||
"term": " == "
|
||||
}
|
||||
|
||||
ruby_multi_operators = {
|
||||
"add": (" + ", "0"), # (operator, zero-array default value) PAIR
|
||||
"sum": (" + ", "0"),
|
||||
|
@@ -334,10 +469,6 @@ default_multi_operators = {
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class BinaryOp(object):
|
||||
def __init__(self, op, term):
|
||||
self.op = op
|
||||
|
@@ -347,22 +478,23 @@ class BinaryOp(object):
|
|||
self.a, self.b = map(qb_expression, term.items()[0])
|
||||
|
||||
def to_ruby(self):
|
||||
symbol = ruby_multi_operators[self.op][0]
|
||||
symbol = ruby_binary_operators[self.op]
|
||||
return "(" + self.a.to_ruby() + ")" + symbol + "(" + self.b.to_ruby() + ")"
|
||||
|
||||
def to_python(self):
|
||||
symbol = python_multi_operators[self.op][0]
|
||||
symbol = python_binary_operators[self.op]
|
||||
return "(" + self.a.to_python() + ")" + symbol + "(" + self.b.to_python() + ")"
|
||||
|
||||
def to_esfilter(self):
|
||||
if self.op in ["gt", "gte", "lte", "lt"]:
|
||||
return {"range":{self.op: {self.a: self.b}}}
|
||||
return {"range": {self.op: {self.a: self.b}}}
|
||||
else:
|
||||
Log.error("Operator {{op}} is not supported by ES", op=self.op)
|
||||
|
||||
def vars(self):
|
||||
return self.a.vars() | self.b.vars()
|
||||
|
||||
|
||||
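A sketch of BinaryOp, assuming qb_expression turns a keyword into a bare-name expression and a number into a literal; the term is hypothetical:

    op = BinaryOp("gt", {"a": 5})
    op.to_python()   # expected: "(a) > (5)"
    op.vars()        # expected: {"a"}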
class MultiOp(object):
|
||||
def __init__(self, op, terms):
|
||||
self.op = op
|
||||
|
@@ -391,6 +523,35 @@ class MultiOp(object):
|
|||
return output
|
||||
|
||||
|
||||
|
||||
_python_unary_operators = {
|
||||
"not": "not {{term}}",
|
||||
"length": 'len({{term}})',
|
||||
"number": 'float({{term}})',
|
||||
}
|
||||
_ruby_unary_operators = {
|
||||
"not": "! {{term}}",
|
||||
"length": '({{term}}).length()',
|
||||
"number": '({{term}}).to_f'
|
||||
}
|
||||
|
||||
class UnaryOp(object):
|
||||
def __init__(self, op, term):
|
||||
self.op = op
|
||||
self.term = qb_expression(term)
|
||||
|
||||
def to_ruby(self):
|
||||
pattern = _ruby_unary_operators[self.op]
|
||||
return expand_template(pattern, {"term": self.term.to_ruby()})
|
||||
|
||||
def to_python(self):
|
||||
pattern = _python_unary_operators[self.op]
|
||||
return expand_template(pattern, {"term": self.term.to_python()})
|
||||
|
||||
def vars(self):
|
||||
return self.term.vars()
|
||||
|
||||
|
||||
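A sketch of the template-driven unary operators above, assuming qb_expression renders a keyword as a bare name; terms are hypothetical:

    UnaryOp("length", "name").to_python()   # expected: "len(name)"
    UnaryOp("number", "name").to_ruby()     # expected: "(name).to_f"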
class RegExpOp(object):
|
||||
def __init__(self, op, term):
|
||||
self.var, self.pattern = term.items()[0]
|
||||
|
@@ -420,6 +581,9 @@ class TermsOp(object):
|
|||
def vars(self):
|
||||
return {self.var}
|
||||
|
||||
def map(self, map):
|
||||
return {"terms": {map.get(self.var, self.var): self.vals}}
|
||||
|
||||
|
||||
class ExistsOp(object):
|
||||
def __init__(self, op, term):
|
||||
|
@@ -440,6 +604,9 @@ class ExistsOp(object):
|
|||
def vars(self):
|
||||
return set([self.field])
|
||||
|
||||
def map(self, map):
|
||||
return {"exists": map.get(self.field, self.field)}
|
||||
|
||||
|
||||
class PrefixOp(object):
|
||||
def __init__(self, op, term):
|
||||
|
@@ -457,6 +624,9 @@ class PrefixOp(object):
|
|||
def vars(self):
|
||||
return set([self.field])
|
||||
|
||||
def map(self, map):
|
||||
return {"prefix": {map.get(self.field, self.field): self.prefix}}
|
||||
|
||||
|
||||
class MissingOp(object):
|
||||
def __init__(self, op, term):
|
||||
|
@@ -477,13 +647,16 @@ class MissingOp(object):
|
|||
def vars(self):
|
||||
return set([self.field])
|
||||
|
||||
def map(self, map):
|
||||
return {"missing": map.get(self.field, self.field)}
|
||||
|
||||
|
||||
class NotOp(object):
|
||||
def __init__(self, op, term):
|
||||
self.term = qb_expression(term)
|
||||
|
||||
def to_ruby(self):
|
||||
return "not " + self.term.to_ruby()
|
||||
return "! " + self.term.to_ruby()
|
||||
|
||||
def to_python(self):
|
||||
return "not" + self.term.to_python()
|
||||
|
@@ -494,16 +667,19 @@ class NotOp(object):
|
|||
def vars(self):
|
||||
return self.term.vars()
|
||||
|
||||
def map(self, map):
|
||||
return {"not": self.term.map(map)}
|
||||
|
||||
|
||||
class RangeOp(object):
|
||||
def __init__(self, op, term):
|
||||
self.field, self.cmp = term.items()[0]
|
||||
|
||||
def to_ruby(self):
|
||||
return " and ".join(qb_expression_to_ruby([{o: {self.field: v}} for o, v in self.cmp.items()]))
|
||||
return " and ".join(qb_expression_to_ruby({"and": [{o: {self.field: v}} for o, v in self.cmp.items()]}))
|
||||
|
||||
def to_python(self):
|
||||
return " and ".join(qb_expression_to_python([{o: {self.field: v}} for o, v in self.cmp.items()]))
|
||||
return " and ".join(qb_expression_to_python({"and": [{o: {self.field: v}} for o, v in self.cmp.items()]}))
|
||||
|
||||
def to_esfilter(self):
|
||||
return {"range": {self.field, self.cmp}}
|
||||
|
@@ -511,16 +687,19 @@ class RangeOp(object):
|
|||
def vars(self):
|
||||
return set([self.field])
|
||||
|
||||
def map(self, map):
|
||||
return {"range": {map.get(self.field, self.field): self.cmp}}
|
||||
|
||||
|
||||
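A sketch of RangeOp expanding into a conjunction; the term is hypothetical and the exact parenthesization depends on the binary-operator compiler:

    r = RangeOp("range", {"a": {"gte": 1, "lt": 10}})
    r.to_python()     # expected, roughly: "((a) >= 1) and ((a) < 10)"
    r.to_esfilter()   # expected: {"range": {"a": {"gte": 1, "lt": 10}}}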
class DocOp(object):
|
||||
"""
|
||||
A literal JSON document
|
||||
"""
|
||||
def __init__(self, term):
|
||||
def __init__(self, op, term):
|
||||
self.json = convert.value2json(term)
|
||||
|
||||
def to_ruby(self):
|
||||
def _convert(v, depth):
|
||||
def _convert(v):
|
||||
if v is None:
|
||||
return "nil"
|
||||
if v is True:
|
||||
|
@@ -532,19 +711,11 @@ class DocOp(object):
|
|||
if isinstance(v, (int, long, float)):
|
||||
return unicode(v)
|
||||
if isinstance(v, dict):
|
||||
var_name = "output" + unicode(depth)
|
||||
return \
|
||||
"lambda {\n" + var_name + "={};\n" + \
|
||||
"".join(
|
||||
"" + var_name + "[" + convert.string2quote(k) + "]=" + _convert(vv, depth + 1) + ";\n" for k, vv in v.items()
|
||||
) + \
|
||||
" return " + var_name + ";\n}.call\n"
|
||||
return "{" + ", ".join(convert.string2quote(k) + "=>" + _convert(vv) for k, vv in v.items()) + "}"
|
||||
if isinstance(v, list):
|
||||
return "[" + ", ".join(_convert(vv, depth+1) for vv in v) + "]"
|
||||
return "[" + ", ".join(_convert(vv) for vv in v) + "]"
|
||||
|
||||
# { output={}; output["failure_classification"]="intermittent"; yield output; }
|
||||
|
||||
return _convert(convert.json_decoder(self.json), 0)
|
||||
return _convert(convert.json_decoder(self.json))
|
||||
|
||||
def to_python(self):
|
||||
return self.json
|
||||
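A sketch of the ruby-literal encoding above, with a hypothetical document and assuming convert.string2quote produces double-quoted strings:

    DocOp("doc", {"a": [1, None]}).to_ruby()   # expected: '{"a"=>[1, nil]}'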
|
@@ -561,6 +732,9 @@ class DocOp(object):
|
|||
|
||||
|
||||
complex_operators = {
|
||||
"not": NotOp,
|
||||
"length": UnaryOp,
|
||||
"number": UnaryOp,
|
||||
"terms": TermsOp,
|
||||
"exists": ExistsOp,
|
||||
"missing": MissingOp,
|
||||
|
|
|
@@ -18,7 +18,6 @@ from pyLibrary.dot import listwrap
|
|||
from pyLibrary.queries import windows
|
||||
from pyLibrary.queries.containers.cube import Cube
|
||||
from pyLibrary.queries.domains import SimpleSetDomain, DefaultDomain
|
||||
# from pyLibrary.queries.py.util import util_filter
|
||||
from pyLibrary.queries.expressions import qb_expression_to_function
|
||||
|
||||
|
|
@@ -0,0 +1,481 @@
|
|||
# encoding: utf-8
|
||||
#
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
# You can obtain one at http:# mozilla.org/MPL/2.0/.
|
||||
#
|
||||
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
|
||||
#
|
||||
from __future__ import unicode_literals
|
||||
from __future__ import division
|
||||
from __future__ import absolute_import
|
||||
from copy import copy
|
||||
|
||||
from pyLibrary import convert
|
||||
from pyLibrary.env import elasticsearch
|
||||
from pyLibrary.env.elasticsearch import ES_NUMERIC_TYPES
|
||||
from pyLibrary.meta import use_settings
|
||||
from pyLibrary.queries import qb
|
||||
from pyLibrary.queries.containers import Container
|
||||
from pyLibrary.queries.domains import NumericDomain, SimpleSetDomain, UniqueDomain
|
||||
from pyLibrary.queries.query import Query
|
||||
from pyLibrary.debugs.logs import Log
|
||||
from pyLibrary.dot.dicts import Dict
|
||||
from pyLibrary.dot import coalesce, set_default, Null, literal_field
|
||||
from pyLibrary.dot import wrap
|
||||
from pyLibrary.strings import expand_template
|
||||
from pyLibrary.thread.threads import Queue, Thread, Lock, Till
|
||||
from pyLibrary.times.dates import Date
|
||||
from pyLibrary.times.durations import HOUR, MINUTE
|
||||
|
||||
|
||||
DEBUG = True
|
||||
TOO_OLD = 2*HOUR
|
||||
singleton = None
|
||||
|
||||
|
||||
class FromESMetadata(Container):
|
||||
"""
|
||||
QUERY THE METADATA
|
||||
"""
|
||||
|
||||
def __new__(cls, *args, **kwargs):
global singleton
if singleton:
return singleton
else:
singleton = object.__new__(cls)
return singleton
|
||||
|
||||
@use_settings
|
||||
def __init__(self, host, index, alias=None, name=None, port=9200, settings=None):
|
||||
if hasattr(self, "settings"):
|
||||
return
|
||||
|
||||
from pyLibrary.queries.containers.lists import ListContainer
|
||||
|
||||
Container.__init__(self, None, schema=self)
|
||||
self.settings = settings
|
||||
self.default_name = coalesce(name, alias, index)
|
||||
self.default_es = elasticsearch.Cluster(settings=settings)
|
||||
self.locker = Lock("")
|
||||
self.todo = Queue("refresh metadata")
|
||||
|
||||
table_columns = metadata_tables()
|
||||
column_columns = metadata_columns()
|
||||
self.tables = ListContainer([], wrap({c.name: c for c in table_columns}))
|
||||
self.columns = ListContainer([], wrap({c.name: c for c in column_columns}))
|
||||
self.columns.insert(column_columns)
|
||||
self.columns.insert(table_columns)
|
||||
self.worker = Thread.run("refresh metadata", self.monitor)
|
||||
return
|
||||
|
||||
@property
|
||||
def query_path(self):
|
||||
return None
|
||||
|
||||
@property
|
||||
def url(self):
|
||||
return self.default_es.path + "/" + self.default_name.replace(".", "/")
|
||||
|
||||
def get_table(self, table_name):
|
||||
with self.locker:
|
||||
return self.tables.query({"where": {"eq": {"name": table_name}}})
|
||||
|
||||
def upsert_column(self, c):
|
||||
existing_columns = filter(lambda r: r.table == c.table and r.abs_name == c.abs_name, self.columns.data)
|
||||
if not existing_columns:
|
||||
self.columns.add(c)
|
||||
cols = filter(lambda r: r.table == "meta.columns", self.columns.data)
|
||||
for c in cols:
|
||||
c.partitions = c.cardinality = c.last_updated = None
|
||||
self.todo.add(c)
|
||||
self.todo.extend(cols)
|
||||
else:
|
||||
set_default(existing_columns[0], c)
|
||||
self.todo.add(existing_columns[0])
|
||||
|
||||
def _get_columns(self, table=None):
|
||||
# TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
|
||||
alias_done = set()
|
||||
metadata = self.default_es.get_metadata(index=table)
|
||||
for index, meta in qb.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
|
||||
for _, properties in meta.mappings.items():
|
||||
columns = elasticsearch.parse_properties(index, None, properties.properties)
|
||||
with self.locker:
|
||||
for c in columns:
|
||||
# ABSOLUTE
|
||||
c.table = index
|
||||
# c.domain = DefaultDomain()
|
||||
self.upsert_column(c)
|
||||
|
||||
for alias in meta.aliases:
|
||||
# ONLY THE LATEST ALIAS IS CHOSEN TO GET COLUMNS
|
||||
if alias in alias_done:
|
||||
continue
|
||||
alias_done.add(alias)
|
||||
|
||||
c = copy(c)
|
||||
c.table = alias
|
||||
self.upsert_column(c)
|
||||
|
||||
def query(self, _query):
|
||||
return self.columns.query(Query(set_default(
|
||||
{
|
||||
"from": self.columns,
|
||||
"sort": ["table", "name"]
|
||||
},
|
||||
_query.as_dict()
|
||||
)))
|
||||
|
||||
def get_columns(self, table):
|
||||
"""
|
||||
RETURN METADATA COLUMNS
|
||||
"""
|
||||
with self.locker:
|
||||
columns = qb.sort(filter(lambda r: r.table == table, self.columns.data), "name")
|
||||
if columns:
|
||||
return columns
|
||||
|
||||
self._get_columns(table=table)
|
||||
with self.locker:
|
||||
columns = qb.sort(filter(lambda r: r.table == table, self.columns.data), "name")
|
||||
if columns:
|
||||
return columns
|
||||
|
||||
# self._get_columns(table=table)
|
||||
Log.error("no columns for {{table}}", table=table)
|
||||
|
||||
def _update_cardinality(self, c):
|
||||
"""
|
||||
QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
|
||||
"""
|
||||
if c.type in ["object", "nested"]:
|
||||
Log.error("not supported")
|
||||
if c.table == "meta.columns":
|
||||
with self.locker:
|
||||
partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.columns, c.abs_name) if g[c.abs_name] != None])
|
||||
self.columns.update({
|
||||
"set": {
|
||||
"partitions": partitions,
|
||||
"cardinality": len(partitions),
|
||||
"last_updated": Date.now()
|
||||
},
|
||||
"where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
|
||||
})
|
||||
return
|
||||
if c.table == "meta.tables":
|
||||
with self.locker:
|
||||
partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.tables, c.abs_name) if g[c.abs_name] != None])
|
||||
self.columns.update({
|
||||
"set": {
|
||||
"partitions": partitions,
|
||||
"cardinality": len(partitions),
|
||||
"last_updated": Date.now()
|
||||
},
|
||||
"where": {"eq": {"table": c.table, "name": c.name}}
|
||||
})
|
||||
return
|
||||
|
||||
result = self.default_es.post("/"+c.table+"/_search", data={
|
||||
"aggs": {c.name: _counting_query(c)},
|
||||
"size": 0
|
||||
})
|
||||
r = result.aggregations.values()[0]
|
||||
cardinality = coalesce(r.value, r._nested.value)
|
||||
|
||||
query = Dict(size=0)
|
||||
if c.type in ["object", "nested"]:
|
||||
Log.note("{{field}} has {{num}} parts", field=c.name, num=c.cardinality)
|
||||
with self.locker:
|
||||
self.columns.update({
|
||||
"set": {
|
||||
"cardinality": cardinaility,
|
||||
"last_updated": Date.now()
|
||||
},
|
||||
"clear": ["partitions"],
|
||||
"where": {"eq": {"table": c.table, "name": c.name}}
|
||||
})
|
||||
return
|
||||
elif c.cardinality > 1000:
|
||||
Log.note("{{field}} has {{num}} parts", field=c.name, num=c.cardinality)
|
||||
with self.locker:
|
||||
self.columns.update({
|
||||
"set": {
|
||||
"cardinality": cardinaility,
|
||||
"last_updated": Date.now()
|
||||
},
|
||||
"clear": ["partitions"],
|
||||
"where": {"eq": {"table": c.table, "name": c.name}}
|
||||
})
|
||||
return
|
||||
elif c.type in ES_NUMERIC_TYPES and c.cardinality > 30:
|
||||
Log.note("{{field}} has {{num}} parts", field=c.name, num=c.cardinality)
|
||||
with self.locker:
|
||||
self.columns.update({
|
||||
"set": {
|
||||
"cardinality": cardinaility,
|
||||
"last_updated": Date.now()
|
||||
},
|
||||
"clear": ["partitions"],
|
||||
"where": {"eq": {"table": c.table, "name": c.name}}
|
||||
})
|
||||
return
|
||||
elif c.nested_path:
|
||||
query.aggs[literal_field(c.name)] = {
|
||||
"nested": {"path": c.nested_path[0]},
|
||||
"aggs": {"_nested": {"terms": {"field": c.name, "size": 0}}}
|
||||
}
|
||||
else:
|
||||
query.aggs[literal_field(c.name)] = {"terms": {"field": c.name, "size": 0}}
|
||||
|
||||
result = self.default_es.post("/"+c.table+"/_search", data=query)
|
||||
|
||||
aggs = result.aggregations.values()[0]
|
||||
if aggs._nested:
|
||||
parts = qb.sort(aggs._nested.buckets.key)
|
||||
else:
|
||||
parts = qb.sort(aggs.buckets.key)
|
||||
|
||||
Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
|
||||
with self.locker:
|
||||
self.columns.update({
|
||||
"set": {
|
||||
"cardinality": cardinaility,
|
||||
"partitions": parts,
|
||||
"last_updated": Date.now()
|
||||
},
|
||||
"where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
|
||||
})
|
||||
|
||||
def monitor(self, please_stop):
|
||||
while not please_stop:
|
||||
if not self.todo:
|
||||
with self.locker:
|
||||
old_columns = filter(lambda c: (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.type not in ["object", "nested"], self.columns)
|
||||
if old_columns:
|
||||
self.todo.extend(old_columns)
|
||||
else:
|
||||
Log.note("no more metatdata to update")
|
||||
|
||||
column = self.todo.pop(timeout=10*MINUTE)
|
||||
if column:
|
||||
if column.type in ["object", "nested"]:
|
||||
continue
|
||||
if column.last_updated >= Date.now()-TOO_OLD:
|
||||
continue
|
||||
self._update_cardinality(column)
|
||||
Log.note("updated {{column.name}}", column=column)
|
||||
|
||||
|
||||
def _counting_query(c):
|
||||
if c.nested_path:
|
||||
return {
|
||||
"nested": {
|
||||
"path": c.nested_path[0] # FIRST ONE IS LONGEST
|
||||
},
|
||||
"aggs": {
|
||||
"_nested": {"cardinality": {
|
||||
"field": c.name,
|
||||
"precision_threshold": 10 if c.type in ES_NUMERIC_TYPES else 100
|
||||
}}
|
||||
}
|
||||
}
|
||||
else:
|
||||
return {"cardinality": {
|
||||
"field": c.name
|
||||
}}
|
||||
|
||||
|
||||
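A sketch of the aggregation bodies produced for a flat column versus a nested one; the columns here are hypothetical:

    _counting_query(Dict(name="a", type="string", nested_path=None))
    # -> {"cardinality": {"field": "a"}}
    _counting_query(Dict(name="a.b", type="string", nested_path=["a"]))
    # -> {"nested": {"path": "a"},
    #     "aggs": {"_nested": {"cardinality": {"field": "a.b", "precision_threshold": 100}}}}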
def metadata_columns():
|
||||
return wrap(
|
||||
[
|
||||
Column(
|
||||
table="meta.columns",
|
||||
name=c,
|
||||
abs_name=c,
|
||||
type="string",
|
||||
nested_path=Null,
|
||||
)
|
||||
for c in [
|
||||
"name",
|
||||
"type",
|
||||
"nested_path",
|
||||
"relative",
|
||||
"abs_name",
|
||||
"table"
|
||||
]
|
||||
] + [
|
||||
Column(
|
||||
table="meta.columns",
|
||||
name=c,
|
||||
abs_name=c,
|
||||
type="object",
|
||||
nested_path=Null,
|
||||
)
|
||||
for c in [
|
||||
"domain",
|
||||
"partitions"
|
||||
]
|
||||
] + [
|
||||
Column(
|
||||
table="meta.columns",
|
||||
name=c,
|
||||
abs_name=c,
|
||||
type="long",
|
||||
nested_path=Null,
|
||||
)
|
||||
for c in [
|
||||
"count",
|
||||
"cardinality"
|
||||
]
|
||||
] + [
|
||||
Column(
|
||||
table="meta.columns",
|
||||
name="last_updated",
|
||||
abs_name="last_updated",
|
||||
type="time",
|
||||
nested_path=Null,
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
def metadata_tables():
|
||||
return wrap(
|
||||
[
|
||||
Column(
|
||||
table="meta.tables",
|
||||
name=c,
|
||||
abs_name=c,
|
||||
type="string",
|
||||
nested_path=Null
|
||||
)
|
||||
for c in [
|
||||
"name",
|
||||
"url",
|
||||
"query_path"
|
||||
]
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def DataClass(name, columns):
|
||||
"""
|
||||
Each column has {"name", "required", "nulls", "default"} properties
|
||||
"""
|
||||
|
||||
columns = wrap([{"name": c, "required": True, "nulls": False} if isinstance(c, basestring) else c for c in columns])
|
||||
slots = columns.name
|
||||
required = wrap(filter(lambda c: c.required and not c.nulls and not c.default, columns)).name
|
||||
nulls = wrap(filter(lambda c: c.nulls, columns)).name
|
||||
|
||||
code = expand_template("""
|
||||
from __future__ import unicode_literals
|
||||
from collections import Mapping

from pyLibrary.debugs.logs import Log
from pyLibrary.dot import Null
|
||||
|
||||
class {{name}}(Mapping):
|
||||
__slots__ = {{slots}}
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
if not kwargs:
|
||||
return
|
||||
|
||||
for s in {{slots}}:
|
||||
setattr(self, s, kwargs.get(s, kwargs.get('default', Null)))
|
||||
|
||||
missed = {{required}}-set(kwargs.keys())
|
||||
if missed:
|
||||
Log.error("Expecting properties {"+"{missed}}", missed=missed)
|
||||
|
||||
illegal = set(kwargs.keys())-set({{slots}})
|
||||
if illegal:
|
||||
Log.error("{"+"{names}} are not a valid properties", names=illegal)
|
||||
|
||||
def __getitem__(self, item):
|
||||
return getattr(self, item)
|
||||
|
||||
def __setitem__(self, item, value):
|
||||
setattr(self, item, value)
|
||||
return self
|
||||
|
||||
def __setattr__(self, item, value):
|
||||
if item not in {{slots}}:
|
||||
Log.error("{"+"{item|quote}} not valid attribute", item=item)
|
||||
object.__setattr__(self, item, value)
|
||||
|
||||
def __getattr__(self, item):
|
||||
Log.error("{"+"{item|quote}} not valid attribute", item=item)
|
||||
|
||||
def items(self):
|
||||
return ((k, getattr(self, k)) for k in {{slots}})
|
||||
|
||||
def __copy__(self):
|
||||
_set = object.__setattr__
|
||||
output = object.__new__({{name}})
|
||||
{{assign}}
|
||||
return output
|
||||
|
||||
def __iter__(self):
|
||||
return {{slots}}.__iter__()
|
||||
|
||||
def __len__(self):
|
||||
return {{len_slots}}
|
||||
|
||||
def __str__(self):
|
||||
return str({{dict}})
|
||||
|
||||
temp = {{name}}
|
||||
""",
|
||||
{
|
||||
"name": name,
|
||||
"slots": "(" + (", ".join(convert.value2quote(s) for s in slots)) + ")",
|
||||
"required": "{" + (", ".join(convert.value2quote(s) for s in required)) + "}",
|
||||
"nulls": "{" + (", ".join(convert.value2quote(s) for s in nulls)) + "}",
|
||||
"len_slots": len(slots),
|
||||
"dict": "{" + (", ".join(convert.value2quote(s) + ": self." + s for s in slots)) + "}",
|
||||
"assign": "; ".join("_set(output, "+convert.value2quote(s)+", self."+s+")" for s in slots)
|
||||
}
|
||||
)
|
||||
|
||||
return _exec(code)
|
||||
|
||||
|
||||
def _exec(code):
|
||||
temp = None
|
||||
exec(code)
|
||||
return temp
|
||||
|
||||
|
||||
class Table(DataClass("Table", [
|
||||
"name",
|
||||
"url",
|
||||
"query_path"
|
||||
])):
|
||||
@property
|
||||
def columns(self):
|
||||
return singleton.get_columns(table=self.name)
|
||||
|
||||
|
||||
Column = DataClass(
|
||||
"Column",
|
||||
[
|
||||
"name",
|
||||
"abs_name",
|
||||
"table",
|
||||
"type",
|
||||
{"name": "useSource", "default": False},
|
||||
{"name": "nested_path", "nulls": True}, # AN ARRAY OF PATHS (FROM DEEPEST TO SHALLOWEST) INDICATING THE JSON SUB-ARRAYS
|
||||
{"name": "relative", "nulls": True},
|
||||
{"name": "count", "nulls": True},
|
||||
{"name": "cardinality", "nulls": True},
|
||||
{"name": "partitions", "nulls": True},
|
||||
{"name": "last_updated", "nulls": True}
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
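A sketch of the generated record class in use, with hypothetical values; the required and slot checks come from the template above:

    c = Column(name="a", abs_name="a", table="t", type="string", nested_path=Null)
    c.cardinality = 10     # a declared, nullable slot
    # c.unknown = 1        # would fail: "unknown" is not a valid attribute
    # Column(name="a")     # would fail: abs_name, table, type are required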
|
|
@@ -0,0 +1,59 @@
|
|||
# encoding: utf-8
|
||||
#
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
# You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
#
|
||||
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
|
||||
#
|
||||
from __future__ import unicode_literals
|
||||
from __future__ import division
|
||||
from __future__ import absolute_import
|
||||
|
||||
from collections import Mapping
|
||||
|
||||
from pyLibrary.dot import set_default, Dict
|
||||
from pyLibrary.queries.query import Query
|
||||
|
||||
|
||||
class Namespace(object):
|
||||
|
||||
def convert(self, expr):
|
||||
raise NotImplementedError()
|
||||
|
||||
def _convert_query(self, query):
|
||||
output = Query()
|
||||
output.select = self._convert_clause(query.select)
|
||||
output.where = self.convert(query.where)
|
||||
output["from"] = self._convert_from(query["from"])
|
||||
output.edges = self._convert_clause(query.edges)
|
||||
output.having = convert_list(self._convert_having, query.having)
|
||||
output.window = convert_list(self._convert_window, query.window)
|
||||
output.sort = self._convert_clause(query.sort)
|
||||
output.format = query.format
|
||||
|
||||
return output
|
||||
|
||||
def _convert_from(self, frum):
|
||||
raise NotImplementedError()
|
||||
|
||||
def _convert_clause(self, clause):
|
||||
raise NotImplementedError()
|
||||
|
||||
def _convert_having(self, clause):
|
||||
raise NotImplementedError()
|
||||
|
||||
def _convert_window(self, clause):
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
def convert_list(operator, operand):
|
||||
if operand == None:
|
||||
return None
|
||||
elif isinstance(operand, Mapping):
|
||||
return operator(operand)
|
||||
else:
|
||||
return map(operator, operand)
|
||||
|
||||
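A sketch of how clause converters get applied, using len as a stand-in converter; the operands are hypothetical:

    convert_list(len, None)            # -> None
    convert_list(len, {"a": 1})        # -> 1    (a single Mapping clause)
    convert_list(len, [{}, {"a": 1}])  # -> [0, 1]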
|
|
@@ -0,0 +1,283 @@
|
|||
# encoding: utf-8
|
||||
#
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
# You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
#
|
||||
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
|
||||
#
|
||||
from __future__ import unicode_literals
|
||||
from __future__ import division
|
||||
from __future__ import absolute_import
|
||||
|
||||
from collections import Mapping
|
||||
from copy import copy
|
||||
|
||||
from pyLibrary.debugs.logs import Log
|
||||
from pyLibrary.dot.dicts import Dict
|
||||
from pyLibrary.dot import coalesce, Null
|
||||
from pyLibrary.dot.lists import DictList
|
||||
from pyLibrary.dot import wrap, listwrap
|
||||
from pyLibrary.maths import Math
|
||||
from pyLibrary.queries.containers import Container
|
||||
from pyLibrary.queries.dimensions import Dimension
|
||||
from pyLibrary.queries.domains import Domain
|
||||
from pyLibrary.queries.expressions import TRUE_FILTER
|
||||
from pyLibrary.queries.namespace import Namespace, convert_list
|
||||
from pyLibrary.queries.query import Query, get_all_vars
|
||||
|
||||
|
||||
DEFAULT_LIMIT = 10
|
||||
|
||||
|
||||
class Normal(Namespace):
|
||||
"""
|
||||
UNREMARKABLE NAMESPACE, SIMPLY FOR CONVERTING QUERY TO NORMAL FORM
|
||||
"""
|
||||
|
||||
def convert(self, expr):
|
||||
if isinstance(expr, Mapping) and expr["from"]:
|
||||
return self._convert_query(expr)
|
||||
return expr
|
||||
|
||||
|
||||
def _convert_query(self, query):
|
||||
# if not isinstance(query["from"], Container):
|
||||
# Log.error('Expecting from clause to be a Container')
|
||||
query = wrap(query)
|
||||
|
||||
output = Query()
|
||||
output["from"] = self._convert_from(query["from"])
|
||||
|
||||
output.format = query.format
|
||||
|
||||
if query.select:
|
||||
output.select = convert_list(self._convert_select, query.select)
|
||||
else:
|
||||
if query.edges or query.groupby:
|
||||
output.select = {"name": "count", "value": ".", "aggregate": "count"}
|
||||
else:
|
||||
output.select = {"name": "__all__", "value": "*", "aggregate": "none"}
|
||||
|
||||
if query.groupby and query.edges:
|
||||
Log.error("You can not use both the `groupby` and `edges` clauses in the same query!")
|
||||
elif query.edges:
|
||||
output.edges = convert_list(self._convert_edge, query.edges)
|
||||
output.groupby = None
|
||||
elif query.groupby:
|
||||
output.edges = None
|
||||
output.groupby = convert_list(self._convert_group, query.groupby)
|
||||
else:
|
||||
output.edges = []
|
||||
output.groupby = None
|
||||
|
||||
output.where = self.convert(query.where)
|
||||
output.window = convert_list(self._convert_window, query.window)
|
||||
output.sort = self._convert_sort(query.sort)
|
||||
|
||||
output.limit = coalesce(query.limit, DEFAULT_LIMIT)
|
||||
if not Math.is_integer(output.limit) or output.limit < 0:
|
||||
Log.error("Expecting limit >= 0")
|
||||
|
||||
output.isLean = query.isLean
|
||||
|
||||
# DEPTH ANALYSIS - LOOK FOR COLUMN REFERENCES THAT MAY BE DEEPER THAN
|
||||
# THE from SOURCE IS.
|
||||
vars = get_all_vars(output, exclude_where=True) # WE WILL EXCLUDE where VARIABLES
|
||||
for c in query.columns:
|
||||
if c.name in vars and c.nested_path:
|
||||
Log.error("This query, with variable {{var_name}} is too deep", var_name=c.name)
|
||||
|
||||
output.having = convert_list(self._convert_having, query.having)
|
||||
|
||||
return output
|
||||
|
||||
def _convert_from(self, frum):
|
||||
if isinstance(frum, basestring):
|
||||
return Dict(name=frum)
|
||||
elif isinstance(frum, (Container, Query)):
|
||||
return frum
|
||||
else:
|
||||
Log.error("Expecting from clause to be a name, or a container")
|
||||
|
||||
def _convert_select(self, select):
|
||||
if isinstance(select, basestring):
|
||||
return Dict(
|
||||
name=select.rstrip("."), # TRAILING DOT INDICATES THE VALUE, BUT IS INVALID FOR THE NAME
|
||||
value=select,
|
||||
aggregate="none"
|
||||
)
|
||||
else:
|
||||
select = wrap(select)
|
||||
output = copy(select)
|
||||
if not select.value or isinstance(select.value, basestring):
|
||||
if select.value == ".":
|
||||
output.name = coalesce(select.name, select.aggregate)
|
||||
else:
|
||||
output.name = coalesce(select.name, select.value, select.aggregate)
|
||||
elif not output.name:
|
||||
Log.error("Must give name to each column in select clause")
|
||||
|
||||
if not output.name:
|
||||
Log.error("expecting select to have a name: {{select}}", select=select)
|
||||
|
||||
output.aggregate = coalesce(canonical_aggregates.get(select.aggregate), select.aggregate, "none")
|
||||
return output
|
||||
|
||||
def _convert_edge(self, edge):
|
||||
if isinstance(edge, basestring):
|
||||
return Dict(
|
||||
name=edge,
|
||||
value=edge,
|
||||
domain=self._convert_domain()
|
||||
)
|
||||
else:
|
||||
edge = wrap(edge)
|
||||
if not edge.name and not isinstance(edge.value, basestring):
|
||||
Log.error("You must name compound edges: {{edge}}", edge= edge)
|
||||
|
||||
if isinstance(edge.value, (Mapping, list)) and not edge.domain:
|
||||
# COMPLEX EDGE IS SHORT HAND
|
||||
domain = self._convert_domain()
|
||||
domain.dimension = Dict(fields=edge.value)
|
||||
|
||||
return Dict(
|
||||
name=edge.name,
|
||||
allowNulls=False if edge.allowNulls is False else True,
|
||||
domain=domain
|
||||
)
|
||||
|
||||
domain = self._convert_domain(edge.domain)
|
||||
return Dict(
|
||||
name=coalesce(edge.name, edge.value),
|
||||
value=edge.value,
|
||||
range=edge.range,
|
||||
allowNulls=False if edge.allowNulls is False else True,
|
||||
domain=domain
|
||||
)
|
||||
|
||||
def _convert_group(self, column):
|
||||
if isinstance(column, basestring):
|
||||
return wrap({
|
||||
"name": column,
|
||||
"value": column,
|
||||
"domain": {"type": "default"}
|
||||
})
|
||||
else:
|
||||
column = wrap(column)
|
||||
if (column.domain and column.domain.type != "default") or column.allowNulls != None:
|
||||
Log.error("groupby does not accept complicated domains")
|
||||
|
||||
if not column.name and not isinstance(column.value, basestring):
|
||||
Log.error("You must name compound edges: {{edge}}", edge= column)
|
||||
|
||||
return wrap({
|
||||
"name": coalesce(column.name, column.value),
|
||||
"value": column.value,
|
||||
"domain": {"type": "default"}
|
||||
})
|
||||
|
||||
|
||||
def _convert_domain(self, domain=None):
|
||||
if not domain:
|
||||
return Domain(type="default")
|
||||
elif isinstance(domain, Dimension):
|
||||
return domain.getDomain()
|
||||
elif isinstance(domain, Domain):
|
||||
return domain
|
||||
|
||||
if not domain.name:
|
||||
domain = domain.copy()
|
||||
domain.name = domain.type
|
||||
|
||||
if not isinstance(domain.partitions, list):
|
||||
domain.partitions = list(domain.partitions)
|
||||
|
||||
return Domain(**domain)
|
||||
|
||||
def _convert_range(self, range):
|
||||
if range == None:
|
||||
return None
|
||||
|
||||
return Dict(
|
||||
min=range.min,
|
||||
max=range.max
|
||||
)
|
||||
|
||||
def _convert_where(self, where):
|
||||
if where == None:
|
||||
return TRUE_FILTER
|
||||
return where
|
||||
|
||||
|
||||
def _convert_window(self, window):
|
||||
return Dict(
|
||||
name=coalesce(window.name, window.value),
|
||||
value=window.value,
|
||||
edges=[self._convert_edge(e) for e in listwrap(window.edges)],
|
||||
sort=self._convert_sort(window.sort),
|
||||
aggregate=window.aggregate,
|
||||
range=self._convert_range(window.range),
|
||||
where=self._convert_where(window.where)
|
||||
)
|
||||
|
||||
|
||||
def _convert_sort(self, sort):
|
||||
return normalize_sort(sort)
|
||||
|
||||
|
||||
def normalize_sort(sort=None):
|
||||
"""
|
||||
CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE
|
||||
"""
|
||||
|
||||
if not sort:
|
||||
return DictList.EMPTY
|
||||
|
||||
output = DictList()
|
||||
for s in listwrap(sort):
|
||||
if isinstance(s, basestring) or Math.is_integer(s):
|
||||
output.append({"value": s, "sort": 1})
|
||||
elif not s.field and not s.value and s.sort == None:
|
||||
#ASSUME {name: sort} FORM
|
||||
for n, v in s.items():
|
||||
output.append({"value": n, "sort": sort_direction[v]})
|
||||
else:
|
||||
output.append({"value": coalesce(s.field, s.value), "sort": coalesce(sort_direction[s.sort], 1)})
|
||||
return wrap(output)
|
||||
|
||||
|
||||
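A sketch of the accepted sort spellings, with hypothetical fields:

    normalize_sort("a")               # -> [{"value": "a", "sort": 1}]
    normalize_sort({"a": "desc"})     # -> [{"value": "a", "sort": -1}]
    normalize_sort([{"value": "b", "sort": -1}])   # unchanged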
sort_direction = {
|
||||
"asc": 1,
|
||||
"desc": -1,
|
||||
"none": 0,
|
||||
1: 1,
|
||||
0: 0,
|
||||
-1: -1,
|
||||
None: 1,
|
||||
Null: 1
|
||||
}
|
||||
|
||||
canonical_aggregates = {
|
||||
"none": "none",
|
||||
"one": "one",
|
||||
"count": "count",
|
||||
"sum": "sum",
|
||||
"add": "sum",
|
||||
"mean": "average",
|
||||
"average": "average",
|
||||
"avg": "average",
|
||||
"min": "minimum",
|
||||
"minimum": "minimum",
|
||||
"max": "maximum",
|
||||
"maximum": "minimum",
|
||||
"X2": "sum_of_squares",
|
||||
"std": "std",
|
||||
"stddev": "std",
|
||||
"std_deviation": "std",
|
||||
"var": "variance",
|
||||
"variance": "variance",
|
||||
"stats": "stats"
|
||||
}
|
||||
|
|
@@ -0,0 +1,140 @@
|
|||
# encoding: utf-8
|
||||
#
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
# You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
#
|
||||
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
|
||||
#
|
||||
from __future__ import unicode_literals
|
||||
from __future__ import division
|
||||
from __future__ import absolute_import
|
||||
|
||||
from collections import Mapping
|
||||
from copy import copy
|
||||
|
||||
from pyLibrary.debugs.logs import Log
|
||||
from pyLibrary.dot import set_default, wrap, coalesce, Dict, listwrap, unwraplist
|
||||
from pyLibrary.maths import Math
|
||||
from pyLibrary.queries.dimensions import Dimension
|
||||
from pyLibrary.queries.domains import is_keyword
|
||||
from pyLibrary.queries.namespace import Namespace, convert_list
|
||||
from pyLibrary.queries.query import Query
|
||||
from pyLibrary.times.dates import Date
|
||||
|
||||
|
||||
class Rename(Namespace):
|
||||
|
||||
def __init__(self, dimensions, source):
|
||||
"""
|
||||
EXPECTING A LIST OF {"name":name, "value":value} OBJECTS TO PERFORM A MAPPING
|
||||
"""
|
||||
dimensions = wrap(dimensions)
|
||||
if isinstance(dimensions, Mapping) and dimensions.name == None:
|
||||
# CONVERT TO A REAL DIMENSION DEFINITION
|
||||
dimensions = {"name": ".", "type": "set", "edges":[{"name": k, "field": v} for k, v in dimensions.items()]}
|
||||
|
||||
self.dimensions = Dimension(dimensions, None, source)
|
||||
|
||||
def convert(self, expr):
|
||||
"""
|
||||
EXPAND INSTANCES OF name TO value
|
||||
"""
|
||||
if expr is True or expr == None or expr is False:
|
||||
return expr
|
||||
elif Math.is_number(expr):
|
||||
return expr
|
||||
elif expr == ".":
|
||||
return "."
|
||||
elif is_keyword(expr):
|
||||
return coalesce(self.dimensions[expr], expr)
|
||||
elif isinstance(expr, basestring):
|
||||
Log.error("{{name|quote}} is not a valid variable name", name=expr)
|
||||
elif isinstance(expr, Date):
|
||||
return expr
|
||||
elif isinstance(expr, Query):
|
||||
return self._convert_query(expr)
|
||||
elif isinstance(expr, Mapping):
|
||||
if expr["from"]:
|
||||
return self._convert_query(expr)
|
||||
elif len(expr) >= 2:
|
||||
#ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
|
||||
return wrap({name: self.convert(value) for name, value in expr.leaves()})
|
||||
else:
|
||||
# ASSUME SINGLE-CLAUSE EXPRESSION
|
||||
k, v = expr.items()[0]
|
||||
return converter_map.get(k, Rename._convert_bop)(self, k, v)
|
||||
elif isinstance(expr, (list, set, tuple)):
|
||||
return wrap([self.convert(value) for value in expr])
|
||||
else:
|
||||
return expr
|
||||
|
||||
def _convert_query(self, query):
|
||||
output = Query(None)
|
||||
output.select = self._convert_clause(query.select)
|
||||
output.where = self.convert(query.where)
|
||||
output.frum = self._convert_from(query.frum)
|
||||
output.edges = convert_list(self._convert_edge, query.edges)
|
||||
output.having = convert_list(self._convert_having, query.having)
|
||||
output.window = convert_list(self._convert_window, query.window)
|
||||
output.sort = self._convert_clause(query.sort)
|
||||
output.format = query.format
|
||||
|
||||
return output
|
||||
|
||||
|
||||
|
||||
|
||||
def _convert_bop(self, op, term):
|
||||
if isinstance(term, list):
|
||||
return {op: map(self.convert, term)}
|
||||
|
||||
return {op: {self.convert(var): val for var, val in term.items()}}
|
||||
|
||||
def _convert_many(self, k, v):
|
||||
return {k: map(self.convert, v)}
|
||||
|
||||
def _convert_from(self, frum):
|
||||
if isinstance(frum, Mapping):
|
||||
return Dict(name=self.convert(frum.name))
|
||||
else:
|
||||
return self.convert(frum)
|
||||
|
||||
def _convert_edge(self, edge):
|
||||
dim = self.dimensions[edge.value]
|
||||
if not dim:
|
||||
return edge
|
||||
|
||||
if len(listwrap(dim.fields)) == 1:
|
||||
#TODO: CHECK IF EDGE DOMAIN AND DIMENSION DOMAIN CONFLICT
|
||||
new_edge = set_default({"value": unwraplist(dim.fields)}, edge)
new_edge.domain = dim.getDomain()
return new_edge
|
||||
|
||||
edge = copy(edge)
|
||||
edge.value = None
|
||||
edge.domain = dim.getDomain()
|
||||
return edge
|
||||
|
||||
def _convert_clause(self, clause):
|
||||
"""
|
||||
Qb QUERIES HAVE MANY CLAUSES WITH SIMILAR COLUMN DECLARATIONS
|
||||
"""
|
||||
clause = wrap(clause)
|
||||
|
||||
if clause == None:
|
||||
return None
|
||||
elif isinstance(clause, Mapping):
|
||||
return set_default({"value": self.convert(clause.value)}, clause)
|
||||
else:
|
||||
return [set_default({"value": self.convert(c.value)}, c) for c in clause]
|
||||
|
||||
converter_map = {
"and": Rename._convert_many,
"or": Rename._convert_many,
"not": lambda self, k, v: {k: self.convert(v)},
"missing": lambda self, k, v: {k: self.convert(v)},
"exists": lambda self, k, v: {k: self.convert(v)}
}
|
||||
|
|
@@ -0,0 +1,111 @@
|
|||
# encoding: utf-8
|
||||
#
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
# You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
#
|
||||
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
|
||||
#
|
||||
from __future__ import unicode_literals
|
||||
from __future__ import division
|
||||
from __future__ import absolute_import
|
||||
|
||||
from collections import Mapping
|
||||
|
||||
from pyLibrary.debugs.logs import Log
|
||||
from pyLibrary.dot import set_default, wrap, Dict, Null
|
||||
from pyLibrary.maths import Math
|
||||
from pyLibrary.queries.domains import is_keyword
|
||||
from pyLibrary.queries.namespace import convert_list, Namespace
|
||||
from pyLibrary.queries.query import Query
|
||||
from pyLibrary.times.dates import Date
|
||||
|
||||
|
||||
class Typed(Namespace):
|
||||
|
||||
def __init__(self):
|
||||
self.converter_map = {
"and": self._convert_many,
"or": self._convert_many,
"not": lambda k, v: {k: self.convert(v)},
"missing": lambda k, v: {k: self.convert(v)},
"exists": lambda k, v: {k: self.convert(v)}
}
|
||||
|
||||
def convert(self, expr):
|
||||
"""
|
||||
ADD THE ".$value" SUFFIX TO ALL VARIABLES
|
||||
"""
|
||||
if expr is True or expr == None or expr is False:
|
||||
return expr
|
||||
elif Math.is_number(expr):
|
||||
return expr
|
||||
elif expr == ".":
|
||||
return "."
|
||||
elif is_keyword(expr):
|
||||
#TODO: LOOKUP SCHEMA AND ADD ALL COLUMNS WITH THIS PREFIX
|
||||
return expr + ".$value"
|
||||
elif isinstance(expr, basestring):
|
||||
Log.error("{{name|quote}} is not a valid variable name", name=expr)
|
||||
elif isinstance(expr, Date):
|
||||
return expr
|
||||
elif isinstance(expr, Query):
|
||||
return self._convert_query(expr)
|
||||
elif isinstance(expr, Mapping):
|
||||
if expr["from"]:
|
||||
return self._convert_query(expr)
|
||||
elif len(expr) >= 2:
|
||||
#ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
|
||||
return wrap({name: self.convert(value) for name, value in expr.items()})
|
||||
else:
|
||||
# ASSUME SINGLE-CLAUSE EXPRESSION
|
||||
k, v = expr.items()[0]
|
||||
return self.converter_map.get(k, self._convert_bop)(k, v)
|
||||
elif isinstance(expr, (list, set, tuple)):
|
||||
return wrap([self.convert(value) for value in expr])
|
||||
|
||||
def _convert_query(self, query):
|
||||
output = Query(Null)
|
||||
output.select = self._convert_clause(query.select)
|
||||
output.where = self.convert(query.where)
|
||||
output.frum = self._convert_from(query.frum)
|
||||
output.edges = self._convert_clause(query.edges)
|
||||
output.groupby = self._convert_clause(query.groupby)
|
||||
output.window = convert_list(self._convert_window, query.window)
|
||||
output.having = convert_list(self._convert_having, query.having)
|
||||
output.sort = self._convert_clause(query.sort)
|
||||
output.limit = query.limit
|
||||
output.format = query.format
|
||||
|
||||
return output
|
||||
|
||||
def _convert_clause(self, clause):
|
||||
"""
|
||||
Qb QUERIES HAVE MANY CLAUSES WITH SIMILAR COLUMN DECLARATIONS
|
||||
"""
|
||||
if clause == None:
|
||||
return None
|
||||
elif isinstance(clause, Mapping):
|
||||
return set_default({"value": self.convert(clause["value"])}, clause)
|
||||
else:
|
||||
return [set_default({"value": self.convert(c.value)}, c) for c in clause]
|
||||
|
||||
def _convert_from(self, frum):
|
||||
return frum
|
||||
|
||||
def _convert_having(self, having):
|
||||
raise NotImplementedError()
|
||||
|
||||
def _convert_window(self, window):
|
||||
raise NotImplementedError()
|
||||
|
||||
def _convert_many(self, k, v):
|
||||
return {k: map(self.convert, v)}
|
||||
|
||||
def _convert_bop(self, op, term):
|
||||
if isinstance(term, list):
|
||||
return {op: map(self.convert, term)}
|
||||
|
||||
return {op: {self.convert(var): val for var, val in term.items()}}
|
||||
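A sketch of the Typed namespace at work, with hypothetical expressions:

    t = Typed()
    t.convert({"eq": {"a": 1}})
    # expected: {"eq": {"a.$value": 1}}
    t.convert({"and": [{"exists": "a"}, {"missing": "b"}]})
    # expected: {"and": [{"exists": "a.$value"}, {"missing": "b.$value"}]}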
|
|
@@ -1,424 +0,0 @@
|
|||
# encoding: utf-8
|
||||
#
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
# You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
#
|
||||
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
|
||||
#
|
||||
from __future__ import unicode_literals
|
||||
from __future__ import division
|
||||
from __future__ import absolute_import
|
||||
|
||||
from collections import Mapping
|
||||
|
||||
from pyLibrary.collections import AND, reverse
|
||||
from pyLibrary.debugs.logs import Log
|
||||
from pyLibrary.dot.dicts import Dict
|
||||
from pyLibrary.dot import coalesce, split_field, join_field, Null
|
||||
from pyLibrary.dot.lists import DictList
|
||||
from pyLibrary.dot import wrap, unwrap, listwrap
|
||||
from pyLibrary.maths import Math
|
||||
from pyLibrary.queries.dimensions import Dimension
|
||||
from pyLibrary.queries.domains import Domain, is_keyword
|
||||
from pyLibrary.queries.expressions import TRUE_FILTER, simplify_esfilter
|
||||
|
||||
|
||||
DEFAULT_LIMIT = 10
|
||||
|
||||
_qb = None
|
||||
_INDEX_CACHE = None
|
||||
|
||||
|
||||
def _late_import():
|
||||
global _qb
|
||||
global _INDEX_CACHE
|
||||
|
||||
from pyLibrary.queries import qb as _qb
|
||||
from pyLibrary.queries.es09.util import INDEX_CACHE as _INDEX_CACHE
|
||||
|
||||
_ = _qb
|
||||
_ = _INDEX_CACHE
|
||||
|
||||
|
||||
def _normalize_selects(selects, schema=None):
|
||||
if isinstance(selects, list):
|
||||
output = wrap([_normalize_select(s, schema=schema) for s in selects])
|
||||
|
||||
exists = set()
|
||||
for s in output:
|
||||
if s.name in exists:
|
||||
Log.error("{{name}} has already been defined", name= s.name)
|
||||
exists.add(s.name)
|
||||
return output
|
||||
else:
|
||||
return _normalize_select(selects, schema=schema)
|
||||
|
||||
|
||||
def _normalize_select(select, schema=None):
|
||||
if isinstance(select, basestring):
|
||||
if schema:
|
||||
s = schema[select]
|
||||
if s:
|
||||
return s.getSelect()
|
||||
return Dict(
|
||||
name=select.rstrip("."), # TRAILING DOT INDICATES THE VALUE, BUT IS INVALID FOR THE NAME
|
||||
value=select,
|
||||
aggregate="none"
|
||||
)
|
||||
else:
|
||||
select = wrap(select)
|
||||
output = select.copy()
|
||||
if not select.value or isinstance(select.value, basestring):
|
||||
output.name = coalesce(select.name, select.value, select.aggregate)
|
||||
elif not output.name:
|
||||
Log.error("Must give name to each column in select clause")
|
||||
|
||||
if not output.name:
|
||||
Log.error("expecting select to have a name: {{select}}", select=select)
|
||||
|
||||
output.aggregate = coalesce(canonical_aggregates.get(select.aggregate), select.aggregate, "none")
|
||||
return output
|
||||
|
||||
|
||||
def _normalize_edges(edges, schema=None):
|
||||
return [_normalize_edge(e, schema=schema) for e in listwrap(edges)]
|
||||
|
||||
|
||||
def _normalize_edge(edge, schema=None):
|
||||
if isinstance(edge, basestring):
|
||||
if schema:
|
||||
e = schema[edge]
|
||||
if e:
|
||||
if isinstance(e.fields, list) and len(e.fields) == 1:
|
||||
return Dict(
|
||||
name=e.name,
|
||||
value=e.fields[0],
|
||||
domain=e.getDomain()
|
||||
)
|
||||
else:
|
||||
return Dict(
|
||||
name=e.name,
|
||||
domain=e.getDomain()
|
||||
)
|
||||
return Dict(
|
||||
name=edge,
|
||||
value=edge,
|
||||
domain=_normalize_domain(schema=schema)
|
||||
)
|
||||
else:
|
||||
edge = wrap(edge)
|
||||
if not edge.name and not isinstance(edge.value, basestring):
|
||||
Log.error("You must name compound edges: {{edge}}", edge= edge)
|
||||
|
||||
if isinstance(edge.value, (Mapping, list)) and not edge.domain:
|
||||
# COMPLEX EDGE IS SHORT HAND
|
||||
domain = _normalize_domain(schema=schema)
|
||||
domain.dimension = Dict(fields=edge.value)
|
||||
|
||||
return Dict(
|
||||
name=edge.name,
|
||||
allowNulls=False if edge.allowNulls is False else True,
|
||||
domain=domain
|
||||
)
|
||||
|
||||
domain = _normalize_domain(edge.domain, schema=schema)
|
||||
return Dict(
|
||||
name=coalesce(edge.name, edge.value),
|
||||
value=edge.value,
|
||||
range=edge.range,
|
||||
allowNulls=False if edge.allowNulls is False else True,
|
||||
domain=domain
|
||||
)
|
||||
|
||||
|
||||
def _normalize_groupby(groupby, schema=None):
|
||||
if groupby == None:
|
||||
return None
|
||||
return [_normalize_group(e, schema=schema) for e in listwrap(groupby)]
|
||||
|
||||
|
||||
def _normalize_group(edge, schema=None):
|
||||
if isinstance(edge, basestring):
|
||||
return wrap({
|
||||
"name": edge,
|
||||
"value": edge,
|
||||
"domain": {"type": "default"}
|
||||
})
|
||||
else:
|
||||
edge = wrap(edge)
|
||||
if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
|
||||
Log.error("groupby does not accept complicated domains")
|
||||
|
||||
if not edge.name and not isinstance(edge.value, basestring):
|
||||
Log.error("You must name compound edges: {{edge}}", edge= edge)
|
||||
|
||||
return wrap({
|
||||
"name": coalesce(edge.name, edge.value),
|
||||
"value": edge.value,
|
||||
"domain": {"type": "default"}
|
||||
})
|
||||
|
||||
|
||||
def _normalize_domain(domain=None, schema=None):
|
||||
if not domain:
|
||||
return Domain(type="default")
|
||||
elif isinstance(domain, Dimension):
|
||||
return domain.getDomain()
|
||||
elif schema and isinstance(domain, basestring) and schema[domain]:
|
||||
return schema[domain].getDomain()
|
||||
elif isinstance(domain, Domain):
|
||||
return domain
|
||||
|
||||
if not domain.name:
|
||||
domain = domain.copy()
|
||||
domain.name = domain.type
|
||||
|
||||
if not isinstance(domain.partitions, list):
|
||||
domain.partitions = list(domain.partitions)
|
||||
|
||||
return Domain(**domain)
|
||||
|
||||
|
||||
def _normalize_range(range):
|
||||
if range == None:
|
||||
return None
|
||||
|
||||
return Dict(
|
||||
min=range.min,
|
||||
max=range.max
|
||||
)
|
||||
|
||||
|
||||
def _normalize_where(where, schema=None):
|
||||
if where == None:
|
||||
return TRUE_FILTER
|
||||
if schema == None:
|
||||
return where
|
||||
where = simplify_esfilter(_where_terms(where, where, schema))
|
||||
return where
|
||||
|
||||
|
||||
|
||||
def _normalize_window(window, schema=None):
|
||||
return Dict(
|
||||
name=coalesce(window.name, window.value),
|
||||
value=window.value,
|
||||
edges=[_normalize_edge(e, schema) for e in listwrap(window.edges)],
|
||||
sort=_normalize_sort(window.sort),
|
||||
aggregate=window.aggregate,
|
||||
range=_normalize_range(window.range),
|
||||
where=_normalize_where(window.where, schema=schema)
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
def _map_term_using_schema(master, path, term, schema_edges):
|
||||
"""
|
||||
IF THE WHERE CLAUSE REFERS TO FIELDS IN THE SCHEMA, THEN EXPAND THEM
|
||||
"""
|
||||
output = DictList()
|
||||
for k, v in term.items():
|
||||
dimension = schema_edges[k]
|
||||
if isinstance(dimension, Dimension):
|
||||
domain = dimension.getDomain()
|
||||
if dimension.fields:
|
||||
if isinstance(dimension.fields, Mapping):
|
||||
# EXPECTING A TUPLE
|
||||
for local_field, es_field in dimension.fields.items():
|
||||
local_value = v[local_field]
|
||||
if local_value == None:
|
||||
output.append({"missing": {"field": es_field}})
|
||||
else:
|
||||
output.append({"term": {es_field: local_value}})
|
||||
continue
|
||||
|
||||
if len(dimension.fields) == 1 and is_keyword(dimension.fields[0]):
|
||||
# SIMPLE SINGLE-VALUED FIELD
|
||||
if domain.getPartByKey(v) is domain.NULL:
|
||||
output.append({"missing": {"field": dimension.fields[0]}})
|
||||
else:
|
||||
output.append({"term": {dimension.fields[0]: v}})
|
||||
continue
|
||||
|
||||
if AND(is_keyword(f) for f in dimension.fields):
|
||||
# EXPECTING A TUPLE
|
||||
if not isinstance(v, tuple):
|
||||
Log.error("expecing {{name}}={{value}} to be a tuple", name= k, value= v)
|
||||
for i, f in enumerate(dimension.fields):
|
||||
vv = v[i]
|
||||
if vv == None:
|
||||
output.append({"missing": {"field": f}})
|
||||
else:
|
||||
output.append({"term": {f: vv}})
|
||||
continue
|
||||
if len(dimension.fields) == 1 and is_keyword(dimension.fields[0]):
|
||||
if domain.getPartByKey(v) is domain.NULL:
|
||||
output.append({"missing": {"field": dimension.fields[0]}})
|
||||
else:
|
||||
output.append({"term": {dimension.fields[0]: v}})
|
||||
continue
|
||||
if domain.partitions:
|
||||
part = domain.getPartByKey(v)
|
||||
if part is domain.NULL or not part.esfilter:
|
||||
Log.error("not expected to get NULL")
|
||||
output.append(part.esfilter)
|
||||
continue
|
||||
else:
|
||||
Log.error("not expected")
|
||||
elif isinstance(v, Mapping):
|
||||
sub = _map_term_using_schema(master, path + [k], v, schema_edges[k])
|
||||
output.append(sub)
|
||||
continue
|
||||
|
||||
output.append({"term": {k: v}})
|
||||
return {"and": output}
|
||||
|
||||
|
||||
def _move_nested_term(master, where, schema):
|
||||
"""
|
||||
THE WHERE CLAUSE CAN CONTAIN NESTED PROPERTY REFERENCES, THESE MUST BE MOVED
|
||||
TO A NESTED FILTER
|
||||
"""
|
||||
items = where.term.items()
|
||||
if len(items) != 1:
|
||||
Log.error("Expecting only one term")
|
||||
k, v = items[0]
|
||||
nested_path = _get_nested_path(k, schema)
|
||||
if nested_path:
|
||||
return {"nested": {
|
||||
"path": nested_path,
|
||||
"query": {"filtered": {
|
||||
"query": {"match_all": {}},
|
||||
"filter": {"and": [
|
||||
{"term": {k: v}}
|
||||
]}
|
||||
}}
|
||||
}}
|
||||
return where
|
||||
|
||||
|
||||
def _get_nested_path(field, schema):
|
||||
if not _INDEX_CACHE:
|
||||
_late_import()
|
||||
|
||||
if is_keyword(field):
|
||||
field = join_field([schema.es.alias] + split_field(field))
|
||||
for i, f in reverse(enumerate(split_field(field))):
|
||||
path = join_field(split_field(field)[0:i + 1:])
|
||||
if path in _INDEX_CACHE:
|
||||
return join_field(split_field(path)[1::])
|
||||
return None
|
||||
|
||||
|
||||
def _where_terms(master, where, schema):
|
||||
"""
|
||||
USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
|
||||
master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)
|
||||
"""
|
||||
if isinstance(where, Mapping):
|
||||
if where.term:
|
||||
# MAP TERM
|
||||
try:
|
||||
output = _map_term_using_schema(master, [], where.term, schema.edges)
|
||||
return output
|
||||
except Exception, e:
|
||||
Log.error("programmer problem?", e)
|
||||
elif where.terms:
|
||||
# MAP TERM
|
||||
output = DictList()
|
||||
for k, v in where.terms.items():
|
||||
if not isinstance(v, (list, set)):
|
||||
Log.error("terms filter expects list of values")
|
||||
edge = schema.edges[k]
|
||||
if not edge:
|
||||
output.append({"terms": {k: v}})
|
||||
else:
|
||||
if isinstance(edge, basestring):
|
||||
# DIRECT FIELD REFERENCE
|
||||
return {"terms": {edge: v}}
|
||||
try:
|
||||
domain = edge.getDomain()
|
||||
except Exception, e:
|
||||
Log.error("programmer error", e)
|
||||
fields = domain.dimension.fields
|
||||
if isinstance(fields, Mapping):
|
||||
or_agg = []
|
||||
for vv in v:
|
||||
and_agg = []
|
||||
for local_field, es_field in fields.items():
|
||||
vvv = vv[local_field]
|
||||
if vvv != None:
|
||||
and_agg.append({"term": {es_field: vvv}})
|
||||
or_agg.append({"and": and_agg})
|
||||
output.append({"or": or_agg})
|
||||
elif isinstance(fields, list) and len(fields) == 1 and is_keyword(fields[0]):
|
||||
output.append({"terms": {fields[0]: v}})
|
||||
elif domain.partitions:
|
||||
output.append({"or": [domain.getPartByKey(vv).esfilter for vv in v]})
|
||||
return {"and": output}
|
||||
elif where["or"]:
|
||||
return {"or": [unwrap(_where_terms(master, vv, schema)) for vv in where["or"]]}
|
||||
elif where["and"]:
|
||||
return {"and": [unwrap(_where_terms(master, vv, schema)) for vv in where["and"]]}
|
||||
elif where["not"]:
|
||||
return {"not": unwrap(_where_terms(master, where["not"], schema))}
|
||||
return where
|
||||
|
||||
|
||||
def _normalize_sort(sort=None):
|
||||
"""
|
||||
CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE
|
||||
"""
|
||||
|
||||
if not sort:
|
||||
return DictList.EMPTY
|
||||
|
||||
output = DictList()
|
||||
for s in listwrap(sort):
|
||||
if isinstance(s, basestring) or Math.is_integer(s):
|
||||
output.append({"field": s, "sort": 1})
|
||||
elif not s.field and not s.value and s.sort == None:
|
||||
#ASSUME {name: sort} FORM
|
||||
for n, v in s.items():
|
||||
output.append({"field": n, "sort": sort_direction[v]})
|
||||
else:
|
||||
output.append({"field": coalesce(s.field, s.value), "sort": coalesce(sort_direction[s.sort], 1)})
|
||||
return wrap(output)
|
||||
|
||||
|
||||
sort_direction = {
|
||||
"asc": 1,
|
||||
"desc": -1,
|
||||
"none": 0,
|
||||
1: 1,
|
||||
0: 0,
|
||||
-1: -1,
|
||||
None: 1,
|
||||
Null: 1
|
||||
}
|
||||
|
||||
canonical_aggregates = {
|
||||
"none": "none",
|
||||
"one": "one",
|
||||
"count": "value_count",
|
||||
"sum": "sum",
|
||||
"add": "sum",
|
||||
"mean": "average",
|
||||
"average": "average",
|
||||
"avg": "average",
|
||||
"min": "minimum",
|
||||
"minimum": "minimum",
|
||||
"max": "maximum",
|
||||
"maximum": "minimum",
|
||||
"X2": "sum_of_squares",
|
||||
"std": "std",
|
||||
"stddev": "std",
|
||||
"std_deviation": "std",
|
||||
"var": "variance",
|
||||
"variance": "variance",
|
||||
"stats": "stats"
|
||||
}
|
||||
|
|
@@ -21,34 +21,82 @@ from pyLibrary.debugs.logs import Log
from pyLibrary.dot import set_default, Null, Dict, split_field, coalesce, join_field
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import listwrap, wrap, unwrap
from pyLibrary.dot.objects import DictObject
from pyLibrary.dot.objects import DictClass, DictObject
from pyLibrary.maths import Math
from pyLibrary.queries import flat_list, query, group_by
from pyLibrary.queries.containers import Container
from pyLibrary.queries.cubes.aggs import cube_aggs
from pyLibrary.queries.expressions import TRUE_FILTER, FALSE_FILTER, compile_expression, qb_expression_to_function
from pyLibrary.queries.expressions import TRUE_FILTER, FALSE_FILTER, compile_expression, qb_expression_to_function, qb_expression_to_python
from pyLibrary.queries.flat_list import FlatList
from pyLibrary.queries.index import Index
from pyLibrary.queries.query import Query, _normalize_selects, sort_direction, _normalize_select
from pyLibrary.queries.containers.cube import Cube
from pyLibrary.queries.normalize import _normalize_sort, _normalize_select, _normalize_selects
from pyLibrary.queries.query import Query
from pyLibrary.queries.unique_index import UniqueIndex


# A COLLECTION OF DATABASE OPERATORS (RELATIONAL ALGEBRA OPERATORS)
# qb QUERY DOCUMENTATION: https://github.com/klahnakoski/qb/tree/master/docs
# START HERE: https://github.com/klahnakoski/qb/blob/master/docs/Qb_Reference.md
# TODO: USE http://docs.sqlalchemy.org/en/latest/core/tutorial.html AS DOCUMENTATION FRAMEWORK


def run(query):
def run(query, frum=None):
    """
    THIS FUNCTION IS SIMPLY SWITCHING BASED ON THE query["from"] CONTAINER,
    BUT IT IS ALSO PROCESSING A list CONTAINER; SEPARATE TO A ListContainer
    """
    frum = Container.new_instance(query["from"])
    q = Query(query, frum)
    return frum.query(q)
    query = Query(query)
    frum = coalesce(frum, query["from"])
    if isinstance(frum, Container):
        return frum.query(query)
    elif isinstance(frum, (list, set, GeneratorType)):
        frum = wrap(list(frum))
    elif isinstance(frum, Cube):
        if is_aggs(query):
            return cube_aggs(frum, query)

    elif isinstance(frum, Query):
        frum = run(frum).data
    else:
        Log.error("Do not know how to handle {{type}}", type=frum.__class__.__name__)

    if is_aggs(query):
        frum = list_aggs(frum, query)
    else: # SETOP
        # try:
        #     if query.filter != None or query.esfilter != None:
        #         Log.error("use 'where' clause")
        # except AttributeError:
        #     pass

        if query.where is not TRUE_FILTER:
            frum = filter(frum, query.where)

        if query.sort:
            frum = sort(frum, query.sort)

        if query.select:
            frum = select(frum, query.select)

    if query.window:
        if isinstance(frum, Cube):
            frum = list(frum.values())

        for param in query.window:
            window(frum, param)

    # AT THIS POINT frum IS IN LIST FORMAT, NOW PACKAGE RESULT
    if query.format == "cube":
        frum = convert.list2cube(frum)
    elif query.format == "table":
        frum = convert.list2table(frum)
        frum.meta.format = "table"
    else:
        frum = wrap({
            "meta": {"format": "list"},
            "data": frum
        })

    return frum


groupby = group_by.groupby

@@ -221,12 +269,13 @@ def select_one(record, selection)
        output = Dict()
        for f in selection:
            f = _normalize_select(f)
            output[f.name]=record[f.value]
            output[f.name] = record[f.value]
        return output
    else:
        Log.error("Do not know how to handle")


def select(data, field_name):
    """
    return list with values from field_name

@@ -395,8 +444,8 @@ def _select_deep_meta(field, depth)
    return assign


# def get_columns(data):
#     return wrap([{"name": n} for n in UNION(set(d.keys()) for d in data)])
def get_columns(data):
    return wrap([{"name": n} for n in UNION(set(d.keys()) for d in data)])


def sort(data, fieldnames=None):

@@ -410,19 +459,29 @@ def sort(data, fieldnames=None):
        if fieldnames == None:
            return wrap(sorted(data))

        fieldnames = _normalize_sort(fieldnames)
        fieldnames = listwrap(fieldnames)
        if len(fieldnames) == 1:
            fieldnames = fieldnames[0]

            # SPECIAL CASE, ONLY ONE FIELD TO SORT BY
            if fieldnames.field == ".":
            if fieldnames == ".":
                return wrap(sorted(data))
            if isinstance(fieldnames, (basestring, int)):
                fieldnames = wrap({"value": fieldnames, "sort": 1})

            # EXPECTING {"field":f, "sort":i} FORMAT
            fieldnames.sort = sort_direction.get(fieldnames.sort, 1)
            fieldnames.value = coalesce(fieldnames.value, fieldnames.field)
            if fieldnames.value == None:
                Log.error("Expecting sort to have 'value' attribute")

            if fieldnames.value == ".":
                #VALUE COMPARE
                def _compare_v(l, r):
                    return value_compare(l, r, fieldnames.sort)
                return DictList([unwrap(d) for d in sorted(data, cmp=_compare_v)])
            else:
                def _compare_o(left, right):
                    return value_compare(coalesce(left)[fieldnames.field], coalesce(right)[fieldnames.field], fieldnames.sort)
                    return value_compare(coalesce(left)[fieldnames.value], coalesce(right)[fieldnames.value], fieldnames.sort)
                return DictList([unwrap(d) for d in sorted(data, cmp=_compare_o)])

        formal = query._normalize_sort(fieldnames)

@@ -432,7 +491,7 @@ def sort(data, fieldnames=None):
            right = coalesce(right)
            for f in formal:
                try:
                    result = value_compare(left[f.field], right[f.field], f.sort)
                    result = value_compare(left[f.value], right[f.value], f.sort)
                    if result != 0:
                        return result
                except Exception, e:

@@ -449,7 +508,7 @@ def sort(data, fieldnames=None):

        return output
    except Exception, e:
        Log.error("Problem sorting\n{{data}}", data= data, cause=e)
        Log.error("Problem sorting\n{{data}}", data=data, cause=e)


def value_compare(l, r, ordering=1):

@@ -491,9 +550,13 @@ def filter(data, where):
    if isinstance(data, Cube):
        data.filter(where)

        temp = None
        exec("def temp(row):\n return "+qb_expression_to_python(where))
        return data.filter(temp)

    try:
        return drill_filter(where, data)
    except Exception, e:
    except Exception, _:
        # WOW! THIS IS INEFFICIENT!
        return wrap([unwrap(d) for d in drill_filter(where, [DictObject(d) for d in data])])

@@ -516,7 +579,10 @@ def drill_filter(esfilter, data):
        col = split_field(fieldname)
        d = data
        for i, c in enumerate(col):
            try:
                d = d[c]
            except Exception, e:
                Log.error("{{name}} does not exist", name=fieldname)
            if isinstance(d, list) and len(col) > 1:
                if len(primary_column) <= depth+i:
                    primary_nested.append(True)

@@ -581,10 +647,11 @@ def drill_filter(esfilter, data):
                return True
            else:
                return {"not": f}
        elif filter.term:
        elif filter.term or filter.eq:
            eq = coalesce(filter.term, filter.eq)
            result = True
            output = {}
            for col, val in filter["term"].items():
            for col, val in eq.items():
                first, rest = parse_field(col, data, depth)
                d = data[first]
                if not rest:

@@ -896,4 +963,4 @@ def reverse(vals):

    return wrap(output)

from pyLibrary.queries.list.aggs import is_aggs, list_aggs
from pyLibrary.queries.lists.aggs import is_aggs, list_aggs
@@ -10,24 +10,28 @@
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from collections import Mapping

from copy import copy

from collections import Mapping
from pyLibrary import convert
from pyLibrary.env import elasticsearch, http
from pyLibrary.meta import use_settings
from pyLibrary.queries import qb, expressions
from pyLibrary.queries.containers import Container, config
from pyLibrary.queries import qb, expressions, containers
from pyLibrary.queries.containers import Container
from pyLibrary.queries.domains import is_keyword
from pyLibrary.queries.es09 import setop as es09_setop
from pyLibrary.queries.es09.util import parse_columns, INDEX_CACHE
from pyLibrary.queries.es14.aggs import es_aggsop, is_aggsop
from pyLibrary.queries.es14.setop import is_fieldop, is_setop, es_setop, es_fieldop
from pyLibrary.queries.es14.deep import is_deepop, es_deepop
from pyLibrary.queries.es14.setop import is_setop, es_setop
from pyLibrary.queries.dimensions import Dimension
from pyLibrary.queries.es14.util import aggregates1_4
from pyLibrary.queries.meta import FromESMetadata
from pyLibrary.queries.namespace.typed import Typed
from pyLibrary.queries.query import Query, _normalize_where
from pyLibrary.debugs.logs import Log, Except
from pyLibrary.dot.dicts import Dict
from pyLibrary.dot import coalesce, split_field, set_default, literal_field, unwraplist
from pyLibrary.dot import coalesce, split_field, literal_field, unwraplist, join_field
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import wrap, listwrap

@@ -43,24 +47,27 @@ class FromES(Container):
            output.__init__(*args, **kwargs)
            return output
        else:
            output = object.__new__(cls)
            output.schema = None #TODO: WHERE IS THE SCHEMA?
            return output
        return Container.__new__(cls)

    @use_settings
    def __init__(self, host, index, type=None, alias=None, name=None, port=9200, read_only=True, settings=None):
        if not config.default:
            config.default.settings = settings
        Container.__init__(self, None, None)
        if not containers.config.default:
            containers.config.default.settings = settings
        self.settings = settings
        self.name = coalesce(name, alias, index)
        if read_only:
            self._es = elasticsearch.Alias(alias=coalesce(alias, index), settings=settings)
        else:
            self._es = elasticsearch.Cluster(settings=settings).get_index(read_only=read_only, settings=settings)

        self.meta = FromESMetadata(settings=settings)
        self.settings.type = self._es.settings.type
        self.schema = Dict()
        self.edges = Dict()
        self.worker = None
        self._columns = None
        self._columns = self.get_columns()
        # SWITCH ON TYPED MODE
        self.typed = any(c.name in ("$value", "$object") for c in self._columns)

    @staticmethod
    def wrap(es):

@@ -91,15 +98,26 @@ class FromES(Container):
        else:
            self.worker.join()

    @property
    def query_path(self):
        return join_field(split_field(self.name)[1:])

    @property
    def url(self):
        return self._es.url

    def query(self, query):
    def query(self, _query):
        try:
            query = Query(_query, schema=self)

            for n in self.namespaces:
                query = n.convert(query)
            if self.typed:
                query = Typed().convert(query)

            for s in listwrap(query.select):
                if not aggregates1_4[s.aggregate]:
                    Log.error("ES can not aggregate " + self.select[0].name + " because '" + self.select[0].aggregate + "' is not a recognized aggregate")
                if not aggregates1_4.get(s.aggregate):
                    Log.error("ES can not aggregate " + s.name + " because '" + s.aggregate + "' is not a recognized aggregate")

            frum = query["from"]
            if isinstance(frum, Query):

@@ -108,10 +126,10 @@ class FromES(Container):
                q2.frum = result
                return qb.run(q2)

            if is_deepop(self._es, query):
                return es_deepop(self._es, query)
            if is_aggsop(self._es, query):
                return es_aggsop(self._es, frum, query)
            if is_fieldop(self._es, query):
                return es_fieldop(self._es, query)
            if is_setop(self._es, query):
                return es_setop(self._es, query)
            if es09_setop.is_setop(query):

@@ -125,60 +143,47 @@ class FromES(Container):
                Log.error("Problem (Tried to clear Elasticsearch cache)", e)
            Log.error("problem", e)

    def get_columns(self, table=None):
        query_path = self.query_path if self.query_path != "." else None
        abs_columns = self.meta.get_columns(table=coalesce(table, self.settings.index))

        columns = []
        if query_path:
            depth = (len(c.nested_path) for c in abs_columns if c.nested_path[0] == query_path).next()
            # ADD RELATIVE COLUMNS
            for c in abs_columns:
                if c.nested_path[0] == query_path:
                    c = copy(c)
                    columns.append(c)
                    c = copy(c)
                    c.name = c.abs_name[len(query_path) + 1:] if c.type != "nested" else "."
                    c.relative = True
                    columns.append(c)
                elif not c.nested_path:
                    c = copy(c)
                    columns.append(c)
                    c = copy(c)
                    c.name = "." + ("." * depth) + c.abs_name
                    c.relative = True
                    columns.append(c)
                elif depth > len(c.nested_path) and query_path.startswith(c.nested_path[0] + "."):
                    diff = depth - len(c.nested_path)
                    c = copy(c)
                    columns.append(c)
                    c = copy(c)
                    c.name = "." + ("." * diff) + (c.abs_name[len(c.nested_path[0]) + 1:] if c.type != "nested" else "")
                    c.relative = True
                    columns.append(c)
                else:
                    continue
        else:
            for c in abs_columns:
                if not c.nested_path:
                    c = copy(c)
                    c.relative = True
                    columns.append(c)

    def get_relative_columns(self):
        if self._columns:
            return self._columns

        abs_columns=self._get_columns(self.settings.alias, self.path)

    def get_columns(self, _from_name=None):
        """
        ENSURE COLUMNS FOR GIVEN INDEX/QUERY ARE LOADED, SCRIPT COMPILATION WILL WORK BETTER

        _from_name - NOT MEANT FOR EXTERNAL USE
        """
        if _from_name is None:
            _from_name = self.name
        if not isinstance(_from_name, basestring):
            Log.error("Expecting string")

        output = INDEX_CACHE.get(_from_name)
        if output:
            # VERIFY es IS CONSISTENT
            if self.url != output.url:
                Log.error("Using {{name}} for two different containers\n\t{{existing}}\n\t{{new}}",
                    name= _from_name,
                    existing= output.url,
                    new= self._es.url)
            return output.columns

        path = split_field(_from_name)
        if len(path) > 1:
            # LOAD THE PARENT (WHICH WILL FILL THE INDEX_CACHE WITH NESTED CHILDREN)
            self.get_columns(_from_name=path[0])
            return INDEX_CACHE[_from_name].columns

        schema = self._es.get_schema()
        properties = schema.properties
        INDEX_CACHE[_from_name] = output = Dict()
        output.name = _from_name
        output.url = self._es.url
        output.columns = parse_columns(_from_name, properties)
        return output.columns

    def get_column_names(self):
        # GET METADATA FOR INDEX
        # LIST ALL COLUMNS
        frum = self.get_columns()
        return frum.name
        return wrap(columns)

    def addDimension(self, dim):
        if isinstance(dim, list):

@@ -189,14 +194,14 @@ class FromES(Container):
        dim.full_name = dim.name
        for e in dim.edges:
            d = Dimension(e, dim, self)
            self.schema[d.full_name] = d
            self.edges[d.full_name] = d

    def __getitem__(self, item):
        e = self.schema[item]
        e = self.edges[item]
        return e

    def __getattr__(self, item):
        return self.schema[item]
        return self.edges[item]

    def normalize_edges(self, edges):
        output = DictList()

@@ -257,23 +262,15 @@ class FromES(Container):
            "size": 200000
        })

        # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
        scripts = DictList()
        for k, v in command.set.items():
            if not is_keyword(k):
                Log.error("Only support simple paths for now")

            if "doc" in v.keys():
                # scripts.append({
                #     "script": "ctx._source[" + convert.string2quote(k) + "] = param_",
                #     "params": {"param_": v["doc"]}
                # })
                #SIMPLE DOC ASSIGNMENT
                scripts.append({"doc": {k: v["doc"]}})
            if isinstance(v, Mapping) and v.doc:
                scripts.append({"doc": v.doc})
            else:
                # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
                scripts.append({
                    "script": "ctx._source[" + convert.string2quote(k) + "] = " + expressions.qb_expression_to_ruby(v) + ";\n"
                })
                scripts.append({"script": "ctx._source." + k + " = " + expressions.qb_expression_to_ruby(v)})

        if results.hits.hits:
            updates = []

@@ -282,7 +279,7 @@ class FromES(Container):
                updates.append({"update": {"_id": h._id, "_routing": unwraplist(h.fields[literal_field(schema._routing.path)])}})
                updates.append(s)
            content = ("\n".join(convert.value2json(c) for c in updates) + "\n").encode('utf-8')
            response = self._es.cluster._post(
            response = self._es.cluster.post(
                self._es.path + "/_bulk",
                data=content,
                headers={"Content-Type": "application/json"}

@@ -290,97 +287,3 @@ class FromES(Container):
            if response.errors:
                Log.error("could not update: {{error}}", error=[e.error for i in response["items"] for e in i.values() if e.status not in (200, 201)])

class FromESMetadata(Container):
    """
    QUERY THE METADATA
    """

    @use_settings
    def __init__(self, host, index, alias=None, name=None, port=9200, settings=None):
        self.settings = settings
        self.name = coalesce(name, alias, index)
        self._es = elasticsearch.Cluster(settings=settings)
        self.metadata = self._es.get_metadata()
        self.columns = None

    @property
    def url(self):
        return self._es.path + "/" + self.name.replace(".", "/")

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        pass

    def query(self, _query):
        if not self.columns:
            self.columns = []
            alias_done = set()
            metadata = self._es.get_metadata()
            for index, meta in qb.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
                for _, properties in meta.mappings.items():
                    columns = _parse_properties(index, properties.properties)
                    for c in columns:
                        c.cube = index
                        c.property = c.name
                        c.name = None
                        c.useSource = None

                    self.columns.extend(columns)
                    for a in meta.aliases:
                        # ONLY THE LATEST ALIAS IS CHOSEN TO GET COLUMNS
                        if a in alias_done:
                            continue
                        alias_done.add(a)
                        for c in columns:
                            self.columns.append(set_default({"cube": a}, c))  # ENSURE WE COPY

        return qb.run(set_default(
            {
                "from": self.columns,
                "sort": ["cube", "property"]
            },
            _query.as_dict()
        ))

    def get_columns(self, _=None):
        """
        RETURN METADATA COLUMNS
        """
        if self.name == "meta.columns":
            return wrap([
                {
                    "name": "cube",
                    "type": "string",
                    "depth": 0
                }, {
                    "name": "column",
                    "type": "string",
                    "depth": 0
                }, {
                    "name": "type",
                    "type": "string",
                    "depth": 0
                }, {
                    "name": "depth",
                    "type": "integer",
                    "depth": 0
                }
            ])
        else:
            Log.error("Unknown metadata: {{name}}", name= self.settings.name)


def _parse_properties(index, properties):
    """
    ISOLATE THE DEALING WITH THE INDEX_CACHE,
    INDEX_CACHE IS REDUNDANT WHEN YOU HAVE metadata.columns
    """
    backup = INDEX_CACHE.get(index)
    INDEX_CACHE[index] = output = Dict()
    output.name = index
    columns = parse_columns(index, properties)
    INDEX_CACHE[index] = backup
    return columns
@@ -10,63 +10,93 @@
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from collections import Mapping

from pyLibrary.collections import AND, reverse
from pyLibrary.debugs.logs import Log
from pyLibrary.dot.dicts import Dict
from pyLibrary.dot import coalesce
from pyLibrary.dot import wrap, listwrap
from pyLibrary.dot import coalesce, split_field, join_field, Null
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import wrap, unwrap, listwrap
from pyLibrary.maths import Math
from pyLibrary.queries import expressions
from pyLibrary.queries import wrap_from
from pyLibrary.queries.containers import Container
from pyLibrary.queries.normalize import _normalize_groupby, _normalize_edges, _normalize_where, _normalize_window, _normalize_sort, DEFAULT_LIMIT, _normalize_selects
from pyLibrary.queries.dimensions import Dimension
from pyLibrary.queries.domains import Domain, is_keyword
from pyLibrary.queries.expressions import TRUE_FILTER, simplify_esfilter, query_get_all_vars


DEFAULT_LIMIT = 10

qb = None


def _late_import():
    global qb

    from pyLibrary.queries import qb

    _ = qb


class Query(object):
    __slots__ = ["frum", "select", "edges", "groupby", "where", "window", "sort", "limit", "format", "isLean"]
    __slots__ = ["frum", "select", "edges", "groupby", "where", "window", "sort", "limit", "having", "format", "isLean"]

    def __new__(cls, query, frum):
    def __new__(cls, query, schema=None):
        if isinstance(query, Query):
            return query
        return object.__new__(cls)
        output = object.__new__(cls)
        for s in Query.__slots__:
            setattr(output, s, None)
        return output

    def __init__(self, query, frum):
    def __init__(self, query, schema=None):
        """
        NORMALIZE QUERY SO IT CAN STILL BE JSON
        """
        object.__init__(self)
        if isinstance(query, Query):
        if isinstance(query, Query) or query == None:
            return

        object.__init__(self)
        query = wrap(query)

        self.frum = frum
        if not isinstance(self.frum, Container):
            Log.error('Expecting from clause to be a Container')

        self.format = query.format
        self.frum = wrap_from(query["from"], schema=schema)

        if query.select:
            self.select = _normalize_selects(query.select, frum.schema)
        select = query.select
        if isinstance(select, list):
            names = set()
            new_select = []
            for s in select:
                ns = _normalize_select(s, schema=schema)
                if ns.name in names:
                    Log.error("two select have the same name")
                names.add(ns.name)
                new_select.append(unwrap(ns))
            self.select = wrap(new_select)
        elif select:
            self.select = _normalize_select(select, schema=schema)
        else:
            if query.edges or query.groupby:
                self.select = {"name": "count", "value": ".", "aggregate": "count"}
            else:
                self.select = {"name": "__all__", "value": "*", "aggregate": "none"}
                self.select = {"name": ".", "value": "*", "aggregate": "none"}

        if query.groupby and query.edges:
            Log.error("You can not use both the `groupby` and `edges` clauses in the same query!")
        elif query.edges:
            self.edges = _normalize_edges(query.edges, schema=self.frum.schema)
            self.edges = _normalize_edges(query.edges, schema=schema)
            self.groupby = None
        elif query.groupby:
            self.edges = None
            self.groupby = _normalize_groupby(query.groupby, schema=self.frum.schema)
            self.groupby = _normalize_groupby(query.groupby, schema=schema)
        else:
            self.edges = []
            self.groupby = None

        self.where = _normalize_where(query.where, schema=self.frum.schema)
        self.where = _normalize_where(query.where, schema=schema)
        self.window = [_normalize_window(w) for w in listwrap(query.window)]
        self.having = None
        self.sort = _normalize_sort(query.sort)
        self.limit = coalesce(query.limit, DEFAULT_LIMIT)
        if not Math.is_integer(self.limit) or self.limit < 0:

@@ -77,9 +107,20 @@ class Query(object):

        # DEPTH ANALYSIS - LOOK FOR COLUMN REFERENCES THAT MAY BE DEEPER THAN
        # THE from SOURCE IS.
        vars = get_all_vars(self, exclude_where=True)  # WE WILL EXCLUDE where VARIABLES
        for c in self.columns:
            if c.name in vars and c.depth:
                # TODO: IGNORE REACHING INTO THE NON-NESTED TYPES
        if isinstance(self.frum, list):
            if not qb:
                _late_import()
            columns = qb.get_columns(self.frum)
        elif isinstance(self.frum, Container):
            columns = self.frum.get_columns(table=query["from"])
        else:
            columns = []

        query_path = coalesce(self.frum.query_path, "")
        vars = query_get_all_vars(self, exclude_where=True)  # WE WILL EXCLUDE where VARIABLES
        for c in columns:
            if c.name in vars and not query_path.startswith(coalesce(c.nested_path[0], "")):
                Log.error("This query, with variable {{var_name}} is too deep", var_name=c.name)

    @property

@@ -102,48 +143,381 @@ class Query(object):
        return output
def get_all_vars(query, exclude_where=False):
    """
    :param query:
    :param exclude_where: Sometimes we do not want to look at the where clause
    :return: all variables in use by query
    """
    output = []
    for s in listwrap(query.select):
        output.extend(select_get_all_vars(s))
    for s in listwrap(query.edges):
        output.extend(edges_get_all_vars(s))
    for s in listwrap(query.groupby):
        output.extend(edges_get_all_vars(s))
    if not exclude_where:
        output.extend(expressions.get_all_vars(query.where))
canonical_aggregates = {
    "min": "minimum",
    "max": "maximum",
    "add": "sum",
    "avg": "average",
    "mean": "average"
}


def _normalize_selects(selects, schema=None):
    if isinstance(selects, list):
        output = wrap([_normalize_select(s, schema=schema) for s in selects])

        exists = set()
        for s in output:
            if s.name in exists:
                Log.error("{{name}} has already been defined", name= s.name)
            exists.add(s.name)
        return output


def select_get_all_vars(s):
    if isinstance(s.value, list):
        return set(s.value)
    elif isinstance(s.value, basestring):
        return set([s.value])
    elif s.value == None or s.value == ".":
        return set()
    else:
        if s.value == "*":
            return set(["*"])
        return expressions.get_all_vars(s.value)
    return _normalize_select(selects, schema=schema)


def edges_get_all_vars(e):
    output = []
    if isinstance(e.value, basestring):
        output.append(e.value)
    if e.domain.key:
        output.append(e.domain.key)
    if e.domain.where:
        output.extend(expressions.get_all_vars(e.domain.where))
    if e.domain.partitions:
        for p in e.domain.partitions:
            if p.where:
                output.extend(expressions.get_all_vars(p.where))
def _normalize_select(select, schema=None):
    if isinstance(select, basestring):
        select = select.rstrip(".")
        if not select:
            return Dict(
                name=".",
                value="*",
                aggregate="none"
            )
        if schema:
            s = schema[select]
            if s:
                return s.getSelect()

        if select.endswith(".*"):
            name = select[:-2]
        else:
            name = select

        return Dict(
            name=name,
            value=select,
            aggregate="none"
        )
    else:
        select = wrap(select)
        output = select.copy()
        if not select.value or isinstance(select.value, basestring):
            if select.value == ".":
                output.name = coalesce(select.name, select.aggregate)
            else:
                output.name = coalesce(select.name, select.value, select.aggregate)
        elif not output.name:
            Log.error("Must give name to each column in select clause")

        if not output.name:
            Log.error("expecting select to have a name: {{select}}", select= select)
        if output.name.endswith(".*"):
            output.name = output.name[:-2]

        output.aggregate = coalesce(canonical_aggregates.get(select.aggregate), select.aggregate, "none")
        return output


def _normalize_edges(edges, schema=None):
    return [_normalize_edge(e, schema=schema) for e in listwrap(edges)]


def _normalize_edge(edge, schema=None):
    if isinstance(edge, basestring):
        if schema:
            e = schema[edge]
            if e:
                if isinstance(e.fields, list) and len(e.fields) == 1:
                    return Dict(
                        name=e.name,
                        value=e.fields[0],
                        allowNulls=True,
                        domain=e.getDomain()
                    )
                else:
                    return Dict(
                        name=e.name,
                        allowNulls=True,
                        domain=e.getDomain()
                    )
        return Dict(
            name=edge,
            value=edge,
            allowNulls=True,
            domain=_normalize_domain(schema=schema)
        )
    else:
        edge = wrap(edge)
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound edges: {{edge}}", edge= edge)

        if isinstance(edge.value, (Mapping, list)) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            domain = _normalize_domain(schema=schema)
            domain.dimension = Dict(fields=edge.value)

            return Dict(
                name=edge.name,
                allowNulls=bool(coalesce(edge.allowNulls, True)),
                domain=domain
            )

        domain = _normalize_domain(edge.domain, schema=schema)
        return Dict(
            name=coalesce(edge.name, edge.value),
            value=edge.value,
            range=edge.range,
            allowNulls=bool(coalesce(edge.allowNulls, True)),
            domain=domain
        )


def _normalize_groupby(groupby, schema=None):
    if groupby == None:
        return None
    return [_normalize_group(e, schema=schema) for e in listwrap(groupby)]


def _normalize_group(edge, schema=None):
    if isinstance(edge, basestring):
        return wrap({
            "name": edge,
            "value": edge,
            "domain": {"type": "default"}
        })
    else:
        edge = wrap(edge)
        if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
            Log.error("groupby does not accept complicated domains")

        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound edges: {{edge}}", edge= edge)

        return wrap({
            "name": coalesce(edge.name, edge.value),
            "value": edge.value,
            "domain": {"type": "default"}
        })


def _normalize_domain(domain=None, schema=None):
    if not domain:
        return Domain(type="default")
    elif isinstance(domain, Dimension):
        return domain.getDomain()
    elif schema and isinstance(domain, basestring) and schema[domain]:
        return schema[domain].getDomain()
    elif isinstance(domain, Domain):
        return domain

    if not domain.name:
        domain = domain.copy()
        domain.name = domain.type

    if not isinstance(domain.partitions, list):
        domain.partitions = list(domain.partitions)

    return Domain(**domain)


def _normalize_window(window, schema=None):
    return Dict(
        name=coalesce(window.name, window.value),
        value=window.value,
        edges=[_normalize_edge(e, schema) for e in listwrap(window.edges)],
        sort=_normalize_sort(window.sort),
        aggregate=window.aggregate,
        range=_normalize_range(window.range),
        where=_normalize_where(window.where, schema=schema)
    )


def _normalize_range(range):
    if range == None:
        return None

    return Dict(
        min=range.min,
        max=range.max
    )


def _normalize_where(where, schema=None):
    if where == None:
        return TRUE_FILTER
    if schema == None:
        return where
    where = simplify_esfilter(_where_terms(where, where, schema))
    return where


def _map_term_using_schema(master, path, term, schema_edges):
    """
    IF THE WHERE CLAUSE REFERS TO FIELDS IN THE SCHEMA, THEN EXPAND THEM
    """
    output = DictList()
    for k, v in term.items():
        dimension = schema_edges[k]
        if isinstance(dimension, Dimension):
            domain = dimension.getDomain()
            if dimension.fields:
                if isinstance(dimension.fields, Mapping):
                    # EXPECTING A TUPLE
                    for local_field, es_field in dimension.fields.items():
                        local_value = v[local_field]
                        if local_value == None:
                            output.append({"missing": {"field": es_field}})
                        else:
                            output.append({"term": {es_field: local_value}})
                    continue

                if len(dimension.fields) == 1 and is_keyword(dimension.fields[0]):
                    # SIMPLE SINGLE-VALUED FIELD
                    if domain.getPartByKey(v) is domain.NULL:
                        output.append({"missing": {"field": dimension.fields[0]}})
                    else:
                        output.append({"term": {dimension.fields[0]: v}})
                    continue

                if AND(is_keyword(f) for f in dimension.fields):
                    # EXPECTING A TUPLE
                    if not isinstance(v, tuple):
                        Log.error("expecting {{name}}={{value}} to be a tuple", name= k, value= v)
                    for i, f in enumerate(dimension.fields):
                        vv = v[i]
                        if vv == None:
                            output.append({"missing": {"field": f}})
                        else:
                            output.append({"term": {f: vv}})
                    continue
            if len(dimension.fields) == 1 and is_keyword(dimension.fields[0]):
                if domain.getPartByKey(v) is domain.NULL:
                    output.append({"missing": {"field": dimension.fields[0]}})
                else:
                    output.append({"term": {dimension.fields[0]: v}})
                continue
            if domain.partitions:
                part = domain.getPartByKey(v)
                if part is domain.NULL or not part.esfilter:
                    Log.error("not expected to get NULL")
                output.append(part.esfilter)
                continue
            else:
                Log.error("not expected")
        elif isinstance(v, Mapping):
            sub = _map_term_using_schema(master, path + [k], v, schema_edges[k])
            output.append(sub)
            continue

        output.append({"term": {k: v}})
    return {"and": output}
def _move_nested_term(master, where, schema):
    """
    THE WHERE CLAUSE CAN CONTAIN NESTED PROPERTY REFERENCES, THESE MUST BE MOVED
    TO A NESTED FILTER
    """
    items = where.term.items()
    if len(items) != 1:
        Log.error("Expecting only one term")
    k, v = items[0]
    nested_path = _get_nested_path(k, schema)
    if nested_path:
        return {"nested": {
            "path": nested_path,
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"term": {k: v}}
                ]}
            }}
        }}
    return where


def _get_nested_path(field, schema):
    if is_keyword(field):
        field = join_field([schema.es.alias] + split_field(field))
        for i, f in reverse(enumerate(split_field(field))):
            path = join_field(split_field(field)[0:i + 1:])
            if path in INDEX_CACHE:
                return join_field(split_field(path)[1::])
    return None


def _where_terms(master, where, schema):
    """
    USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
    master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)
    """
    if isinstance(where, Mapping):
        if where.term:
            # MAP TERM
            try:
                output = _map_term_using_schema(master, [], where.term, schema.edges)
                return output
            except Exception, e:
                Log.error("programmer problem?", e)
        elif where.terms:
            # MAP TERM
            output = DictList()
            for k, v in where.terms.items():
                if not isinstance(v, (list, set)):
                    Log.error("terms filter expects list of values")
                edge = schema.edges[k]
                if not edge:
                    output.append({"terms": {k: v}})
                else:
                    if isinstance(edge, basestring):
                        # DIRECT FIELD REFERENCE
                        return {"terms": {edge: v}}
                    try:
                        domain = edge.getDomain()
                    except Exception, e:
                        Log.error("programmer error", e)
                    fields = domain.dimension.fields
                    if isinstance(fields, Mapping):
                        or_agg = []
                        for vv in v:
                            and_agg = []
                            for local_field, es_field in fields.items():
                                vvv = vv[local_field]
                                if vvv != None:
                                    and_agg.append({"term": {es_field: vvv}})
                            or_agg.append({"and": and_agg})
                        output.append({"or": or_agg})
                    elif isinstance(fields, list) and len(fields) == 1 and is_keyword(fields[0]):
                        output.append({"terms": {fields[0]: v}})
                    elif domain.partitions:
                        output.append({"or": [domain.getPartByKey(vv).esfilter for vv in v]})
            return {"and": output}
        elif where["or"]:
            return {"or": [unwrap(_where_terms(master, vv, schema)) for vv in where["or"]]}
        elif where["and"]:
            return {"and": [unwrap(_where_terms(master, vv, schema)) for vv in where["and"]]}
        elif where["not"]:
            return {"not": unwrap(_where_terms(master, where["not"], schema))}
    return where


def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE
    """
    if not sort:
        return DictList.EMPTY

    output = DictList()
    for s in listwrap(sort):
        if isinstance(s, basestring) or Math.is_integer(s):
            output.append({"value": s, "sort": 1})
        else:
            output.append({"value": coalesce(s.value, s.field), "sort": coalesce(sort_direction[s.sort], 1)})
    return wrap(output)


sort_direction = {
    "asc": 1,
    "desc": -1,
    "none": 0,
    1: 1,
    0: 0,
    -1: -1,
    None: 1,
    Null: 1
}
@@ -405,6 +405,7 @@ class MySQL(object):
        )

    @staticmethod
    @use_settings
    def execute_file(
        filename,
        host,

@@ -424,7 +425,7 @@ class MySQL(object):
        except Exception, e:
            pass
        else:
            MySQL.execute_sql(settings, sql, param)
            MySQL.execute_sql(sql=sql, param=param, settings=settings)

    def _execute_backlog(self):
        if not self.backlog: return

@@ -55,7 +55,7 @@ def unix(value):

def url(value):
    """
    CONVERT FROM dict OR string TO URL PARAMETERS
    _CONVERT FROM dict OR string TO URL PARAMETERS
    """
    if not _convert:
        _late_import()

@@ -65,7 +65,7 @@ def url(value):

def html(value):
    """
    CONVERT FROM unicode TO HTML OF THE SAME
    _CONVERT FROM unicode TO HTML OF THE SAME
    """
    if not _convert:
        _late_import()

@@ -553,14 +553,14 @@ def utf82unicode(value):
        _late_import()

    if not isinstance(value, basestring):
        _Log.error("Can not convert {{type}} to unicode because it's not a string", type= type(value).__name__)
        _Log.error("Can not _convert {{type}} to unicode because it's not a string", type= type(value).__name__)

    e = _Except.wrap(e)
    for i, c in enumerate(value):
        try:
            c.decode("utf8")
        except Exception, f:
            _Log.error("Can not convert charcode {{c}} in string index {{i}}", i=i, c=ord(c), cause=[e, _Except.wrap(f)])
            _Log.error("Can not _convert charcode {{c}} in string index {{i}}", i=i, c=ord(c), cause=[e, _Except.wrap(f)])

    try:
        latin1 = unicode(value.decode("latin1"))

@@ -18,6 +18,8 @@ from pyLibrary.env.files import File
from pyLibrary.queries import qb
from pyLibrary.dot.dicts import Dict
from pyLibrary.dot import unwrap, wrap
from pyLibrary.queries.expressions import qb_expression_to_function


def make_test_instance(name, settings):
    if settings.filename:

@@ -56,7 +58,7 @@ class Fake_ES():

    def search(self, query):
        query=wrap(query)
        f = convert.esfilter2where(query.query.filtered.filter)
        f = qb_expression_to_function(query.query.filtered.filter)
        filtered=wrap([{"_id": i, "_source": d} for i, d in self.data.items() if f(d)])
        if query.fields:
            return wrap({"hits": {"total":len(filtered), "hits": [{"_id":d._id, "fields":unwrap(qb.select([unwrap(d._source)], query.fields)[0])} for d in filtered]}})

@@ -8,13 +8,12 @@
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from collections import Mapping

import unittest

from pyLibrary import dot
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import coalesce, Dict, literal_field
from pyLibrary.dot import coalesce, literal_field
from pyLibrary.maths import Math
from pyLibrary.dot import wrap
from pyLibrary.queries.unique_index import UniqueIndex
from pyLibrary.strings import expand_template

@@ -68,7 +67,7 @@ def zipall(*args):
    while True:
        output = zip(*(_next(a) for a in iters))
        if all(output[0]):
            return
            raise StopIteration
        else:
            yield output[1]
@@ -10,33 +10,19 @@ Module `threads`
The main distinction between this library and Python's is two-fold:

1. **Multi-threaded queues do not use serialization** - Serialization is great in the general case, where you may also be communicating between processes, but it is a needless overhead for single-process multi-threading. It is left to the programmer to ensure the messages put on the queue are not changed, which is not an onerous demand.
2. **Shutdown order is deterministic and explicit** - Python's threading library is missing strict conventions for controlled and orderly shutdown.
2. **Shutdown order is deterministic and explicit, if desired** - If there is one aspect Python's threading library is missing, it is a strict convention for controlled and orderly shutdown.
  * All threads are required to accept a `please_stop` token and are expected to test for its signal in a timely manner and exit when signalled.
  * All threads have a parent - The parent is responsible for ensuring their children get the `please_stop` signal, and are dead, before stopping themselves.
  * All threads have a parent, which is ultimately responsible for ensuring its children get the `please_stop` signal and are dead before it stops itself.

These conventions eliminate the need for `interrupt()` and `abort()`, both of which are unstable idioms when resources are held. Each thread can shut down on its own terms, but is expected to do so expediently; a minimal sketch of the convention follows.
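A rough sketch of the convention, based on the code in this commit (the unit of work is a placeholder, and `Thread.run` returning the thread object is an assumption):

```python
from pyLibrary.thread.threads import Thread

def polling_worker(please_stop):
    # A COMPLIANT THREAD TESTS please_stop OFTEN, AND EXITS PROMPTLY WHEN SIGNALLED
    while not please_stop:
        # do one unit of work here, then yield the CPU
        Thread.sleep(1)

worker = Thread.run("polling worker", polling_worker)
# ... later, the parent signals and waits for its child before stopping itself
worker.stop()
worker.join()
```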
###What's it used for###

A good amount of time is spent waiting for underlying C libraries and OS
services to respond to network and file access requests. Multiple
threads can make your code faster despite the GIL when dealing with those
requests. For example, by moving logging off the main thread, we can get
up to 15% increase in overall speed because we no longer have the main thread
waiting for disk writes or remote logging posts. Please note, this level of
speed improvement can only be realized if there is no serialization happening
at the multi-threaded queue.
A good amount of time is spent waiting for underlying C libraries and OS services to respond to network and file access requests. Multiple threads can make your code faster despite the GIL when dealing with those requests. For example, by moving logging off the main thread, we can get up to a 15% increase in overall speed because we no longer have the main thread waiting for disk writes or remote logging posts. Please note, this level of speed improvement can only be realized if there is no serialization happening at the multi-threaded queue.
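A minimal sketch of that pattern, using this library's `Queue` (the queue name and the log line are illustrative):

```python
from pyLibrary.thread.threads import Queue, Thread

log_queue = Queue("log lines")

def log_writer(please_stop):
    # DRAIN THE QUEUE; ITERATION ENDS WHEN THE QUEUE IS CLOSED
    for line in log_queue:
        pass  # the slow I/O (disk write, or remote logging post) happens here

Thread.run("log writer", log_writer)
log_queue.add("main thread moves on without waiting for the disk")
log_queue.close()
```

Messages are put on the queue as-is, with no serialization, so the worker sees the very same objects the main thread enqueued.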
###Asynch vs. Actors###

My personal belief is that [actors](http://en.wikipedia.org/wiki/Actor_model)
are easier to reason about than [asynch tasks](https://docs.python.org/3/library/asyncio-task.html).
Mixing regular methods and co-routines (with their `yield from` pollution) is
dangerous because:
1) calling styles between methods and co-routines can be easily confused
2) actors can use methods, co-routines can not
3) there is no way to manage resource priority with co-routines.
4) stack traces are lost with co-routines
My personal belief is that [actors](http://en.wikipedia.org/wiki/Actor_model) are easier to reason about, and

Synchronization Primitives
--------------------------

@@ -45,39 +31,10 @@ There are three major aspects of a synchronization primitive:

* **Resource** - Monitors and locks can only be owned by one thread at a time
* **Binary** - The primitive has only two states
* **Irreversible** - The state of the primitive can only be set, or advanced, never reversed
* **Reversible** - The state of the primitive can be set, or advanced, and reversed again

The last, *irreversibility* is very useful, but ignored in many threading
libraries. The irreversibility allows us to model progression; and
we can allow threads to poll for progress, or be notified of progress.

These three aspects can be combined to give us 8 synchronization primitives:

* `- - -` - Semaphore
* `- B -` - Binary Semaphore
* `R - -` - Monitor
* `R B -` - Lock
* `- - I` - Progress
* `- B I` - Signal
* `R - I` - ?limited usefulness?
* `R B I` - ?limited usefulness?
The last, *reversibility*, is very useful, but ignored in many threading libraries. The lack of reversibility allows us to model progression, and we can allow threads to poll for progress, or be notified of progress.

###Class `Signal`###

An irreversible binary semaphore used to signal state progression.

**Method `wait_for_go(self, timeout=None, till=None)`**

Put a thread into a waiting state until the signal is activated.

**Method `go(self)`**

Activate the signal. Does nothing if already activated.

**Method `is_go(self)`**

Test if the signal is activated, do not wait.

**Method `on_go(self, target)`**

Run the `target` method when the signal is activated. The activating thread will be running the target method, so be sure you are not accessing shared resources. A small usage sketch follows.
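A usage sketch based only on the methods listed above (the callback is a placeholder):

```python
from pyLibrary.thread.threads import Signal, Thread

done = Signal()
done.on_go(lambda: None)  # HYPOTHETICAL CALLBACK; RUNS ON THE ACTIVATING THREAD

def consumer(please_stop):
    done.wait_for_go()  # BLOCK UNTIL THE STATE ADVANCES
    # SAFE TO PROCEED; THE STATE CHANGE IS PERMANENT

Thread.run("consumer", consumer)
done.go()  # IRREVERSIBLE; A SECOND CALL DOES NOTHING
assert done.is_go()
```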
@@ -10,102 +10,71 @@ from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import

import subprocess

from pyLibrary.debugs.logs import Log
from pyLibrary.thread.threads import Queue

# YOU ARE READING AN INCOMPLETE IMPLEMENTATION

class worker(object):
    def __init__(func, inbound, outbound, logging):
        logger = Log_usingInterProcessQueue(logging)
from pyLibrary.thread.threads import Queue, Thread, Signal


DEBUG=True

class Log_usingInterProcessQueue(Log):
    def __init__(self, outbound):
        self.outbound = outbound
class Process(object):

    def write(self, template, params):
        self.outbound.put({"template": template, "param": params})
    def __init__(self, name, params, cwd=None):
        self.name = name
        self.service_stopped = Signal()
        self.send = Queue("send")
        self.recieve = Queue("recieve")


class Multiprocess(object):
    # THE COMPLICATION HERE IS CONNECTING THE DISPARATE LOGGING TO
    # A CENTRAL POINT
    # ONLY THE MAIN THREAD CAN CREATE AND COMMUNICATE WITH multiprocess.Process

    def __init__(self, functions):
        self.outbound = Queue("out to process")
        self.inbound = Queue("in from stdin")
        self.inbound = Queue("in from stderr")

        # MAKE

        # MAKE THREADS
        self.threads = []
        for t, f in enumerate(functions):
            thread = worker(
                "worker " + unicode(t),
                f,
                self.inbound,
                self.outbound,
        try:
            self.service = service = subprocess.Popen(
                params,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                bufsize=-1,
                cwd=cwd
            )
            self.threads.append(thread)

    def __enter__(self):
        return self

    # WAIT FOR ALL QUEUED WORK TO BE DONE BEFORE RETURNING
    def __exit__(self, a, b, c):
        try:
            self.inbound.close()  # SEND STOPS TO WAKE UP THE WORKERS WAITING ON inbound.pop()
            self.stopper = Signal()
            self.stopper.on_go(lambda: service.kill())
            Thread.run(self.name+" waiter", waiter, self)
            Thread.run(self.name+" stdout", reader, service.stdout, self.recieve, please_stop=self.stopper)
            Thread.run(self.name+" stderr", reader, service.stderr, self.recieve, please_stop=self.stopper)
            Thread.run(self.name+" stdin", writer, service.stdin, self.recieve, please_stop=self.stopper)
        except Exception, e:
            Log.warning("Problem adding to inbound", e)
            Log.error("Can not call", e)

        self.join()


    # IF YOU SENT A stop(), OR STOP, YOU MAY WAIT FOR SHUTDOWN
    def join(self):
        try:
            # WAIT FOR FINISH
            for t in self.threads:
                t.join()
        except (KeyboardInterrupt, SystemExit):
            Log.note("Shutdown started, please be patient")
        except Exception, e:
            Log.error("Unusual shutdown!", e)
        finally:
            for t in self.threads:
                t.keep_running = False
            for t in self.threads:
                t.join()
            self.inbound.close()
            self.outbound.close()


    # RETURN A GENERATOR THAT HAS len(parameters) RESULTS (ANY ORDER)
    def execute(self, parameters):
        # FILL QUEUE WITH WORK
        self.inbound.extend(parameters)

        num = len(parameters)

        def output():
            for i in xrange(num):
                result = self.outbound.pop()
                yield result

        return output()

    # EXTERNAL COMMAND THAT RETURNS IMMEDIATELY
    def stop(self):
        self.inbound.close()  # SEND STOPS TO WAKE UP THE WORKERS WAITING ON inbound.pop()
        for t in self.threads:
            t.keep_running = False
        self.stopper.go()
        self.send.add("exit")

    def join(self):
        self.service_stopped.wait_for_go()


def waiter(this, please_stop):
    this.service.wait()
    if DEBUG:
        Log.alert("{{name}} stopped", name=this.name)
    this.service_stopped.go()

def reader(stdout, recieve, please_stop):
    while not please_stop:
        line = stdout.readline()
        if line:
            recieve.add(line)
            Log.note("FROM PROCESS: {{line}}", line=line.rstrip())
        else:
            Thread.sleep(1)
    stdout.close()


def writer(stdin, send, please_stop):
    while not please_stop:
        line = send.pop()
        if line:
            stdin.write(line+"\n")
    stdin.close()
@ -29,17 +29,17 @@ from pyLibrary.times.dates import Date
|
|||
from pyLibrary.times.durations import SECOND
|
||||
|
||||
|
||||
_Log = None
|
||||
Log = None
|
||||
DEBUG = True
|
||||
MAX_DATETIME = datetime(2286, 11, 20, 17, 46, 39)
|
||||
|
||||
|
||||
def _late_import():
|
||||
global _Log
|
||||
global Log
|
||||
|
||||
from pyLibrary.debugs.logs import Log as _Log
|
||||
from pyLibrary.debugs.logs import Log
|
||||
|
||||
_ = _Log
|
||||
_ = Log
|
||||
|
||||
|
||||
class Lock(object):
|
||||
|
@ -67,7 +67,7 @@ class Lock(object):
|
|||
|
||||
def wait(self, timeout=None, till=None):
|
||||
if till:
|
||||
timeout = (datetime.utcnow() - till).total_seconds()
|
||||
timeout = (till - Date.now()).seconds
|
||||
if timeout < 0:
|
||||
return
|
||||
self.monitor.wait(timeout=float(timeout) if timeout else None)
|
||||
|
@ -94,7 +94,6 @@ class Queue(object):
|
|||
self.lock = Lock("lock for queue " + name)
|
||||
self.queue = deque()
|
||||
self.next_warning = datetime.utcnow() # FOR DEBUGGING
|
||||
self.gc_count = 0
|
||||
|
||||
def __iter__(self):
|
||||
while self.keep_running:
|
||||
|
@ -103,9 +102,9 @@ class Queue(object):
|
|||
if value is not Thread.STOP:
|
||||
yield value
|
||||
except Exception, e:
|
||||
_Log.warning("Tell me about what happened here", e)
|
||||
Log.warning("Tell me about what happened here", e)
|
||||
|
||||
_Log.note("queue iterator is done")
|
||||
Log.note("queue iterator is done")
|
||||
|
||||
|
||||
def add(self, value):
|
||||
|
@ -115,6 +114,18 @@ class Queue(object):
|
|||
self.queue.append(value)
|
||||
return self
|
||||
|
||||
def push(self, value):
|
||||
"""
|
||||
SNEAK value TO FRONT OF THE QUEUE
|
||||
"""
|
||||
with self.lock:
|
||||
self._wait_for_queue_space()
|
||||
if self.keep_running:
|
||||
self.queue.appendleft(value)
|
||||
return self
|
||||
|
||||
|
||||
|
||||
def extend(self, values):
|
||||
with self.lock:
|
||||
# ONCE THE queue IS BELOW LIMIT, ALLOW ADDING MORE
|
||||
|
@ -142,7 +153,7 @@ class Queue(object):
|
|||
now = datetime.utcnow()
|
||||
if self.next_warning < now:
|
||||
self.next_warning = now + timedelta(seconds=wait_time)
|
||||
_Log.alert("Queue {{name}} is full ({{num}} items), thread(s) have been waiting {{wait_time}} sec",
|
||||
Log.alert("Queue {{name}} is full ({{num}} items), thread(s) have been waiting {{wait_time}} sec",
|
||||
name=self.name,
|
||||
num=len(self.queue),
|
||||
wait_time=wait_time
|
||||
|
@ -156,20 +167,21 @@ class Queue(object):
|
|||
with self.lock:
|
||||
return any(r != Thread.STOP for r in self.queue)
|
||||
|
||||
def pop(self, till=None):
|
||||
def pop(self, till=None, timeout=None):
|
||||
"""
|
||||
WAIT FOR NEXT ITEM ON THE QUEUE
|
||||
RETURN Thread.STOP IF QUEUE IS CLOSED
|
||||
IF till IS PROVIDED, THEN pop() CAN TIMEOUT AND RETURN None
|
||||
"""
|
||||
|
||||
if timeout:
|
||||
till = Date.now() + timeout
|
||||
|
||||
with self.lock:
|
||||
if till == None:
|
||||
if not till:
|
||||
while self.keep_running:
|
||||
if self.queue:
|
||||
value = self.queue.popleft()
|
||||
self.gc_count += 1
|
||||
if self.gc_count % 1000 == 0:
|
||||
gc.collect()
|
||||
if value is Thread.STOP: # SENDING A STOP INTO THE QUEUE IS ALSO AN OPTION
|
||||
self.keep_running = False
|
||||
return value
|
||||
|
@@ -195,7 +207,7 @@ class Queue(object):
                 if self.keep_running:
                     return None

-        _Log.note("queue stopped")
+        Log.note("queue stopped")
         return Thread.STOP

@@ -217,6 +229,21 @@ class Queue(object):
             self.queue.clear()
         return output

+    def pop_one(self):
+        """
+        NON-BLOCKING POP IN QUEUE, IF ANY
+        """
+        with self.lock:
+            if not self.keep_running:
+                return [Thread.STOP]
+            elif not self.queue:
+                return None
+            else:
+                v = self.queue.pop()
+                if v is Thread.STOP:  # SENDING A STOP INTO THE QUEUE IS ALSO AN OPTION
+                    self.keep_running = False
+                return v
+
     def close(self):
         with self.lock:
             self.keep_running = False

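`pop_one()` is the strictly non-blocking variant: `None` means the queue is merely empty at this instant, while a closed queue reports `Thread.STOP` (wrapped in a list by this implementation). A drain-loop sketch, with a hypothetical `handle()` standing in for real work:

    from pyLibrary.thread.threads import Queue, Thread

    def handle(value):
        pass  # hypothetical per-item work

    q = Queue("demo")
    q.extend(["a", "b", "c"])
    while True:
        v = q.pop_one()
        if v is None or v == [Thread.STOP]:
            break  # empty for now, or closed for good
        handle(v)
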
@@ -237,7 +264,7 @@ class AllThread(object):
     """

     def __init__(self):
-        if not _Log:
+        if not Log:
             _late_import()
         self.threads = []

@@ -256,10 +283,10 @@ class AllThread(object):
                 if "exception" in response:
                     exceptions.append(response["exception"])
             except Exception, e:
-                _Log.warning("Problem joining", e)
+                Log.warning("Problem joining", e)

         if exceptions:
-            _Log.error("Problem in child threads", exceptions)
+            Log.error("Problem in child threads", exceptions)


     def add(self, target, *args, **kwargs):

@@ -292,7 +319,7 @@ class MainThread(object):
         children = copy(self.children)
         for c in reversed(children):
             if c.name:
-                _Log.note("Stopping thread {{name|quote}}", name=c.name)
+                Log.note("Stopping thread {{name|quote}}", name=c.name)
             c.stop()
         for c in children:
             c.join()

@@ -317,7 +344,7 @@ class Thread(object):


     def __init__(self, name, target, *args, **kwargs):
-        if not _Log:
+        if not Log:
             _late_import()
         self.id = -1
         self.name = name

@@ -357,14 +384,14 @@ class Thread(object):
         self.kwargs = None

     def start(self):
-        if not _Log:
+        if not Log:
             _late_import()

         try:
             self.thread = thread.start_new_thread(Thread._run, (self, ))
             return self
         except Exception, e:
-            _Log.error("Can not start thread", e)
+            Log.error("Can not start thread", e)

     def stop(self):
         for c in copy(self.children):

@@ -378,7 +405,7 @@ class Thread(object):
             self.children.remove(child)

     def _run(self):
-        if _Log.cprofiler:
+        if Log.cprofiler:
             import cProfile

             self.cprofiler = cProfile.Profile()

@@ -398,7 +425,7 @@ class Thread(object):
             with self.synch_lock:
                 self.response = Dict(exception=e)
             try:
-                _Log.fatal("Problem in thread {{name|quote}}", name=self.name, cause=e)
+                Log.fatal("Problem in thread {{name|quote}}", name=self.name, cause=e)
             except Exception, f:
                 sys.stderr.write("ERROR in thread: " + str(self.name) + " " + str(e) + "\n")
             finally:

@@ -420,7 +447,7 @@ class Thread(object):
                 import pstats

                 self.cprofiler.disable()
-                _Log.cprofiler_stats.add(pstats.Stats(self.cprofiler))
+                Log.cprofiler_stats.add(pstats.Stats(self.cprofiler))
                 del self.cprofiler

     def is_alive(self):

@@ -435,7 +462,7 @@ class Thread(object):
             if till is None:
                 till = datetime.utcnow() + timedelta(seconds=timeout)
             else:
-                _Log.error("Can not except both `timeout` and `till`")
+                Log.error("Can not except both `timeout` and `till`")

         children = copy(self.children)
         for c in children:

@@ -451,7 +478,7 @@ class Thread(object):
                     self.synch_lock.wait(0.5)

                     if DEBUG:
-                        _Log.note("Waiting on thread {{thread|json}}", thread=self.name)
+                        Log.note("Waiting on thread {{thread|json}}", thread=self.name)
         else:
             self.stopped.wait_for_go(till=till)
             if self.stopped:

@@ -464,12 +491,12 @@ class Thread(object):

     @staticmethod
     def run(name, target, *args, **kwargs):
-        if not _Log:
+        if not Log:
             _late_import()

         # ENSURE target HAS please_stop ARGUMENT
         if "please_stop" not in target.__code__.co_varnames:
-            _Log.error("function must have please_stop argument for signalling emergency shutdown")
+            Log.error("function must have please_stop argument for signalling emergency shutdown")

         Thread.num_threads += 1

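`Thread.run()` rejects any target whose signature lacks `please_stop`, so every spawned worker can be asked to shut down cooperatively. A minimal worker sketch:

    from pyLibrary.thread.threads import Thread

    def worker(please_stop):
        # please_stop is a Signal; stay busy until someone triggers it
        while not please_stop:
            pass  # one unit of work per iteration

    t = Thread.run("demo worker", worker)
    t.stop()   # signals the worker (and any child threads) to wind down
    t.join()
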
@@ -478,7 +505,7 @@ class Thread(object):
         return output

     @staticmethod
-    def sleep(seconds=None, till=None, please_stop=None):
+    def sleep(seconds=None, till=None, timeout=None, please_stop=None):

         if please_stop is not None or isinstance(till, Signal):
             if isinstance(till, Signal):

@@ -487,6 +514,8 @@ class Thread(object):

         if seconds is not None:
             till = datetime.utcnow() + timedelta(seconds=seconds)
+        elif timeout is not None:
+            till = datetime.utcnow() + timedelta(seconds=timeout.seconds)
         elif till is None:
             till = MAX_DATETIME

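`Thread.sleep()` gains a `timeout` parameter for `Duration`-style values (only its `.seconds` property is read), alongside the existing plain-number `seconds`. The two calls below should be equivalent, assuming the `SECOND` constant imported at the top of this module:

    from pyLibrary.thread.threads import Thread
    from pyLibrary.times.durations import SECOND

    Thread.sleep(seconds=5)           # plain number of seconds
    Thread.sleep(timeout=5 * SECOND)  # Duration; .seconds is read internally
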
@@ -528,9 +557,9 @@ class Thread(object):
         please_stop.on_go(lambda: MAIN_THREAD.stop())

         if Thread.current() != MAIN_THREAD:
-            if not _Log:
+            if not Log:
                 _late_import()
-            _Log.error("Only the main thread can sleep forever (waiting for KeyboardInterrupt)")
+            Log.error("Only the main thread can sleep forever (waiting for KeyboardInterrupt)")

         try:
             if allow_exit:

@@ -539,7 +568,7 @@ class Thread(object):
                 _wait_for_interrupt(please_stop)
         except (KeyboardInterrupt, SystemExit), _:
             please_stop.go()
-            _Log.alert("SIGINT Detected! Stopping...")
+            Log.alert("SIGINT Detected! Stopping...")

         MAIN_THREAD.stop()

@@ -607,7 +636,7 @@ class Signal(object):
                 try:
                     j()
                 except Exception, e:
-                    _Log.warning("Trigger on Signal.go() failed!", e)
+                    Log.warning("Trigger on Signal.go() failed!", e)

     def is_go(self):
         """

@@ -642,7 +671,7 @@ class ThreadedQueue(Queue):
         period=None,  # MAX TIME BETWEEN FLUSHES TO SLOWER QUEUE
         silent=False  # WRITES WILL COMPLAIN IF THEY ARE WAITING TOO LONG
     ):
-        if not _Log:
+        if not Log:
             _late_import()

         batch_size = coalesce(batch_size, int(coalesce(max_size, 0) / 2), 900)

@@ -688,7 +717,7 @@ class ThreadedQueue(Queue):
                         _buffer.append(item)

             except Exception, e:
-                _Log.warning(
+                Log.warning(
                     "Unexpected problem",
                     name=name,
                     cause=e

@@ -706,7 +735,7 @@ class ThreadedQueue(Queue):
                         next_time = now + bit_more_time

             except Exception, e:
-                _Log.warning(
+                Log.warning(
                     "Problem with {{name}} pushing {{num}} items to data sink",
                     name=name,
                     num=len(_buffer),

@@ -717,7 +746,7 @@ class ThreadedQueue(Queue):
             # ONE LAST PUSH, DO NOT HAVE TIME TO DEAL WITH ERRORS
             queue.extend(_buffer)

-        self.thread = Thread.run("threaded queue for " + name, worker_bee, parent_thread=self)
+        self.thread = Thread.run("threaded queue for " + name, worker_bee)

     def add(self, value):
         with self.lock:

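The worker thread is no longer spawned with the `parent_thread` argument, and, as the updated tests elsewhere in this commit show, the constructor is now called as `ThreadedQueue(name, target_queue, max_size=...)`, with `batch_size` defaulting to `coalesce(batch_size, max_size / 2, 900)`. A sketch of buffering writes to a slow sink, assuming the sink only needs the `extend()` method the worker calls:

    from pyLibrary.thread.threads import ThreadedQueue

    class SlowSink(object):
        # stand-in for a slow consumer, e.g. an elasticsearch index
        def __init__(self):
            self.flushed = []
        def extend(self, batch):
            self.flushed.append(len(batch))  # record each background flush

    sink = SlowSink()
    with ThreadedQueue("demo buffer", sink, max_size=1000) as output:
        for i in range(5000):
            output.add(i)  # batched in the background, roughly 500 per flush
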
@@ -776,15 +805,46 @@ def _wait_for_exit(please_stop):
             cr_count = -1000000  # NOT /dev/null

         if strings.strip(line) == "exit":
-            _Log.alert("'exit' Detected! Stopping...")
+            Log.alert("'exit' Detected! Stopping...")
             return


 def _wait_for_interrupt(please_stop):
     while not please_stop:
         if DEBUG:
-            _Log.note("inside wait-for-shutdown loop")
+            Log.note("inside wait-for-shutdown loop")
         try:
             Thread.sleep(please_stop=please_stop)
         except Exception, _:
             pass

+
+
+class Till(Signal):
+    """
+    MANAGE THE TIMEOUT LOGIC
+    """
+    def __init__(self, till=None, timeout=None, seconds=None):
+        Signal.__init__(self)
+
+        timers = []
+
+        def go():
+            self.go()
+            for t in timers:
+                t.cancel()
+
+        if isinstance(till, Date):
+            t = threading.Timer((till - Date.now()).seconds, go)
+            t.start()
+            timers.append(t)
+        if timeout:
+            t = threading.Timer(timeout.seconds, go)
+            t.start()
+            timers.append(t)
+        if seconds:
+            t = threading.Timer(seconds, go)
+            t.start()
+            timers.append(t)
+        if isinstance(till, Signal):
+            till.on_go(go)

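The new `Till` signal turns a deadline into something that can be polled or waited on like any other `Signal`: it fires after the `till` date, the `timeout` duration, or a plain number of `seconds`, and cancels its `threading.Timer`s once triggered. A sketch of a loop bounded to roughly thirty seconds:

    from pyLibrary.thread.threads import Thread, Till

    def poller(please_stop):
        done = Till(seconds=30)  # fires once, about thirty seconds from now
        while not please_stop and not done:
            Thread.sleep(seconds=1, please_stop=please_stop)

    Thread.run("bounded poller", poller).join()
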
@@ -12,6 +12,7 @@ from __future__ import division
 from __future__ import absolute_import

+import datetime
 from decimal import Decimal

 from pyLibrary import regex
 from pyLibrary.vendor.dateutil.relativedelta import relativedelta

@@ -20,14 +21,12 @@ from pyLibrary.maths import Math
 from pyLibrary.dot import wrap


-_Date = None
-_Log = None
-
-
+Date = None
+Log = None
 def _delayed_import():
-    global _Date
-    from pyLibrary.times.dates import Date as _Date
-    _ = _Date(None)
+    global Date
+    from pyLibrary.times.dates import Date
+    _ = Date(None)


 class Duration(object):

@@ -71,7 +70,7 @@ class Duration(object):
     @staticmethod
     def range(start, stop, step):
         if not step:
-            _Log.error("Expecting a non-zero duration for interval")
+            Log.error("Expecting a non-zero duration for interval")
         output = []
         c = start
         while c < stop:

@@ -88,12 +87,12 @@ class Duration(object):
         return output

     def __radd__(self, other):
-        if not _Date:
+        if not Date:
             _delayed_import()

         if isinstance(other, datetime.datetime):
-            return _Date(other).add(self)
-        elif isinstance(other, _Date):
+            return Date(other).add(self)
+        elif isinstance(other, Date):
             return other.add(self)
         return self + other

@@ -212,10 +211,10 @@ class Duration(object):

     @property
     def seconds(self):
-        return self.milli / 1000.0
+        return float(self.milli) / 1000.0

     def total_seconds(self):
-        return self.milli / 1000.0
+        return float(self.milli) / 1000.0

     def __str__(self):
         return str(self.__unicode__())

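Wrapping `self.milli` in `float()` covers the case where the millisecond count arrives as a `Decimal` (this module imports `Decimal` above): in Python 2, mixing `Decimal` with a float literal raises `TypeError`, while converting first is safe. A quick illustration:

    from decimal import Decimal

    milli = Decimal(1500)
    # milli / 1000.0                  # TypeError in Python 2: Decimal and float do not mix
    seconds = float(milli) / 1000.0   # 1.5
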
File diff hidden because one or more lines are too long
@@ -1,120 +0,0 @@
-{
-    "production_es": {
-        "description": "pointer to es with known good results",
-        "host": "http://elasticsearch7.metrics.scl3.mozilla.com",
-        "port": "9200",
-        "index": "bugs",
-        "type": "bug_version",
-        "debug": true
-    },
-    "public_bugs_reference": {
-        "description": "pointer to es with known good *public* results",
-        "filename": "./tests/resources/public_bugs_reference_es.json"
-    },
-    "public_comments_reference": {
-        "description": "pointer to es with known good public comments",
-        "filename": "./tests/resources/public_comments_reference_es.json"
-    },
-    "private_bugs_reference": {
-        "description": "pointer to es with known good results",
-        "filename": "./tests/resources/private_bugs_reference_es.json"
-    },
-    "private_comments_reference": {
-        "description": "pointer to es with known good private comments",
-        "filename": "./tests/resources/private_comments_reference_es.json"
-    },
-    "candidate": {
-        "description": "pointer to es with test results",
-        "filename": "./tests/results/test_results.json",
-        "host": "http://localhost",
-        "port": "9200",
-        "index": "test_bugs",
-        "type": "bug_version"
-    },
-    "fake":{
-        //FOR TESTING JSON CREATION, NO NEED FOR REAL ES
-        "bugs": {
-            "filename":"./tests/results/test_bugs.json"
-        },
-        "comments": {
-            "filename":"./tests/results/test_comments.json"
-        }
-    },
-    "real":{
-        //FOR TESTING INCREMENTAL ETL (AND GENERAL INTERACTION WITH A REAL ES)
-        "bugs": {
-            "host": "http://localhost",
-            "port": "9200",
-            "index": "test_bugs",
-            "type": "bug_version",
-            "schema_file": "./resources/json/bug_version.json",
-            "debug": true
-        },
-        "comments": {
-            "host": "http://localhost",
-            "port": "9200",
-            "index": "test_comments",
-            "type": "bug_version",
-            "schema_file": "./resources/json/bug_comments.json",
-            "debug": true
-        }
-    },
-    "param": {
-        "increment": 10000,
-        "bugs": [ 384, 1108, 1045, 1046, 1157, 1877, 1865, 1869,
-            2586, 3140, 6810, 9622, 10575, 11040, 12911, 67742,
-            96421, 123203, 178960, 367518, 457765, 458397, 471427, 544327,
-            547727, 643420, 692436, 726635, 813650
-            // ADD 372836 (REVIEW FLAGS TEST)
-            // 13534 (REVIEW MOVES TO OTHER PERSON)
-            // 393845 added blocking1.9+ twice
-            // 671185 *many* review requests
-            // 937428 whitespace after comma in user story, complex diff
-            // 248970 another cutoff review request
-
-        ],
-        "alias_increment": 1000000,
-        "alias_file": {
-            "path": "./resources/json/bugzilla_aliases.json"
-        },
-        "temp_dir": "./tests/resources",
-        "errors": "./tests/results/errors",
-        "allow_private_bugs": true,
-        "last_run_time": "./tests/results/last_run_time.txt",
-        "first_run_time": "./tests/results/first_run_time.txt"
-    },
-    "bugzilla": {
-        "filename": "./tests/resources/sql/small_bugzilla.sql",
-        "preamble": "from https://github.com/klahnakoski/Bugzilla-ETL",
-        "host": "localhost",
-        "port": 3306,
-        "username": "user",
-        "password": "password",
-        "schema": "test_bugzilla",
-        "expires_on": 1372867005000,
-        "debug": false
-    },
-    "debug": {
-        "profile": false,
-        "trace": false,
-        "log": [
-            {
-                "class": "logging.handlers.RotatingFileHandler",
-                "filename": "./tests/results/logs/test_etl.log",
-                "maxBytes": 10000000,
-                "backupCount": 200,
-                "encoding": "utf8"
-            },
-            {
-                "log_type": "stream",
-                "stream": "sys.stdout"
-            },
-            {
-                "log_type": "elasticsearch",
-                "host": "http://klahnakoski-es.corp.tor1.mozilla.com",
-                "index": "debug",
-                "type": "bz_etl"
-            }
-        ]
-    }
-}

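This checked-in settings file, database password included, is deleted: per the commit message, config files move out of the repository. The tests now point at an explicit path instead, so a local copy can live anywhere; a sketch assembled from the updated `setUp()` further down:

    from pyLibrary.debugs import startup, constants
    from pyLibrary.debugs.logs import Log

    settings = startup.read_settings(filename="./tests/resources/config/test_settings.json")
    constants.set(settings.constants)
    Log.start(settings.debug)
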
setup.py (2 changed lines)

@@ -32,7 +32,7 @@ def get_resources(source, destination):

 setup(
     name='Bugzilla-ETL',
-    version="0.3.13353",
+    version="2.0.13353",
     description='Mozilla Bugzilla Bug Version ETL',
     long_description=long_desc,
     author='Kyle Lahnakoski',

File diff hidden because one or more lines are too long
@@ -6,12 +6,12 @@ from pymysql.times import TimeDelta
 from bzETL.extract_bugzilla import SCREENED_WHITEBOARD_BUG_GROUPS
 from pyLibrary.env import startup, elasticsearch
-from pyLibrary import struct
-from pyLibrary.cnv import CNV
+from pyLibrary import convert
 from pyLibrary.env.emailer import Emailer
 from pyLibrary.env.logs import Log, extract_stack
 from pyLibrary.maths import Math
-from pyLibrary.queries import Q
-from pyLibrary.struct import nvl, Struct
+from pyLibrary.queries import qb
+from pyLibrary.struct import coalesce, Dict

 # WRAP Log.error TO SHOW THE SPECIFIC ERROR IN THE LOGFILE
 if not hasattr(Log, "old_error"):

@@ -26,7 +26,7 @@ if not hasattr(Log, "old_error"):
     ##ASSIGN AS CLASS METHOD
     Log.error=MethodType(new_error, Log)

-NOW = CNV.datetime2milli(datetime.utcnow())
+NOW = convert.datetime2milli(datetime.utcnow())
 A_WHILE_AGO = int(NOW - TimeDelta(minutes=10).total_seconds()*1000)

@@ -58,7 +58,7 @@ class TestLookForLeaks(unittest.TestCase):
             "facets": {"0": {"statistical": {"field": "bug_id"}}}
         }).facets["0"].max

-        return reversed(list(Q.intervals(0, max_bug_id, self.settings.param.increment)))
+        return reversed(list(qb.intervals(0, max_bug_id, self.settings.param.increment)))

     def test_private_bugs_not_leaking(self):
         bad_news = False

@@ -103,9 +103,9 @@ class TestLookForLeaks(unittest.TestCase):

             Log.note("{{num}} leaks!! {{bugs}}", {
                 "num": len(leaked_bugs),
-                "bugs": Q.run({
+                "bugs": qb.run({
                     "from":leaked_bugs,
-                    "select":["bug_id", "bug_version_num", {"name":"modified_ts", "value":lambda d: CNV.datetime2string(CNV.milli2datetime(d.modified_ts))}],
+                    "select":["bug_id", "bug_version_num", {"name":"modified_ts", "value":lambda d: convert.datetime2string(convert.milli2datetime(d.modified_ts))}],
                     "sort":"bug_id"
                 })
             })

@@ -170,7 +170,7 @@ class TestLookForLeaks(unittest.TestCase):
             fields=["bug_id", "bug_group", "attachments", "modified_ts"]
         )

-        private_attachments = Q.run({
+        private_attachments = qb.run({
             "from": bugs_w_private_attachments,
             "select": "attachments.attach_id",
             "where": {"or": [

@@ -181,7 +181,7 @@ class TestLookForLeaks(unittest.TestCase):
         try:
             private_attachments = [int(v) for v in private_attachments]
         except Exception, e:
-            private_attachments = Q.run({
+            private_attachments = qb.run({
                 "from": bugs_w_private_attachments,
                 "select": "attachments.attach_id",
                 "where": {"or": [

@@ -263,29 +263,29 @@ class TestLookForLeaks(unittest.TestCase):

         if leaked_whiteboard:
             for l in leaked_whiteboard:
-                l.modified_ts=CNV.datetime2string(CNV.milli2datetime(l.modified_ts))
+                l.modified_ts=convert.datetime2string(convert.milli2datetime(l.modified_ts))

             Log.error("Whiteboard leaking:\n{{leak|indent}}", {"leak": leaked_whiteboard})


 def get(es, esfilter, fields=None, limit=None):
-    query = struct.wrap({
+    query = wrap({
         "query": {"filtered": {
             "query": {"match_all": {}},
             "filter": esfilter
         }},
         "from": 0,
-        "size": nvl(limit, 200000),
+        "size": coalesce(limit, 200000),
         "sort": []
     })

     if fields:
         query.fields=fields
         results = es.search(query)
-        return Q.select(results.hits.hits, "fields")
+        return qb.select(results.hits.hits, "fields")
     else:
         results = es.search(query)
-        return Q.select(results.hits.hits, "_source")
+        return qb.select(results.hits.hits, "_source")

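The rewritten `get()` helper wraps a filtered ES search and flattens the hits, defaulting `size` via `coalesce(limit, 200000)`. A hedged usage sketch inside this test module, assuming `es` is an open index wrapper like the ones these tests build:

    leaks = get(
        es,
        {"terms": {"bug_group": ["super secret"]}},  # esfilter
        fields=["bug_id", "modified_ts"],
        limit=10
    )
    for l in leaks:
        Log.note("leaked {{bug_id}}", {"bug_id": l.bug_id})
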
@@ -300,8 +300,8 @@ def milli2datetime(r):
     elif isinstance(r, basestring):
         return r
     elif Math.is_number(r):
-        if CNV.value2number(r) > 800000000000:
-            return CNV.datetime2string(CNV.milli2datetime(r), "%Y-%m-%d %H:%M:%S")
+        if convert.value2number(r) > 800000000000:
+            return convert.datetime2string(convert.milli2datetime(r), "%Y-%m-%d %H:%M:%S")
         else:
             return r
     elif isinstance(r, dict):

@@ -320,7 +320,7 @@ def milli2datetime(r):
         if not output:
             return None
         try:
-            return Q.sort(output)
+            return qb.sort(output)
         except Exception:
             return output
     else:

@@ -339,7 +339,7 @@ def main():
         if results.errors or results.failures:
             error(results)
     except Exception, e:
-        error(Struct(errors=[e]))
+        error(Dict(errors=[e]))
     finally:
         pass

@@ -1,13 +1,13 @@
 # encoding: utf-8
 #
-from pyLibrary.sql.db import DB, SQL
+from pyLibrary.sql.db import MySQL, SQL
 from pyLibrary.env.logs import Log
 from pyLibrary.env import startup

 def main():
     """
     MEANT TO BE RUN JUST ONCE IN DEVELOPMENT TO CONVERT A BIG PUBLIC
-    DATABASE (8G+) INTO A TINY TESTING DB (FOR ADDING TO REPOSITORY)
+    DATABASE (8G+) INTO A TINY TESTING MySQL (FOR ADDING TO REPOSITORY)
     """
     try:
         settings=startup.read_settings()

@@ -20,7 +20,7 @@ def main():
         Log.note("Scrubbing db of those pesky records.")
         Log.note("This is going to take hours ...")

-        DB.execute_file(settings.bugzilla, "./tests/resources/sql/scrub_db.sql", {
+        MySQL.execute_file(settings.bugzilla, "./tests/resources/sql/scrub_db.sql", {
             "schema":settings.bugzilla.schema,
             "bug_list":SQL(settings.param.bugs)
         })

@@ -1,19 +1,19 @@
 # encoding: utf-8
 #
 from pyLibrary import struct
-from pyLibrary.cnv import CNV
+from pyLibrary import convert
 from pyLibrary.env.files import File
 from pyLibrary.env.logs import Log
-from pyLibrary.queries import Q
+from pyLibrary.queries import qb
 from pyLibrary.env import startup


 def main(settings):
     file = File(settings.param.alias_file)
-    aliases = CNV.JSON2object(file.read())
+    aliases = convert.json2value(file.read())

     for v in aliases.values():
-        v.candidates = CNV.dict2Multiset(v.candidates)
+        v.candidates = convert.dict2Multiset(v.candidates)

     data = [
         {

@@ -24,7 +24,7 @@ def main(settings):
         if d.canonical != None and n != d.canonical
     ]

-    sorted = Q.sort(data, "found")
+    sorted = qb.sort(data, "found")
     for s in sorted:
         Log.note("{{found}} == {{lost}}", s)

@@ -35,11 +35,11 @@ def main(settings):
     }

     rev_clean = struct.inverse(clean)
-    Log.note(CNV.object2JSON(rev_clean, pretty=True))
+    Log.note(convert.value2json(rev_clean, pretty=True))

     for k, v in rev_clean.items():
         if len(v) > 3:
-            Log.note(CNV.object2JSON({k: v}, pretty=True))
+            Log.note(convert.value2json({k: v}, pretty=True))


 def start():

@@ -24,6 +24,9 @@ WHERE
 ;
 COMMIT;
+
+
+

 START TRANSACTION;
 DELETE FROM
     tracking_flags_bugs

@@ -182,6 +185,8 @@ INSERT INTO keep_profiles SELECT watch_user FROM components;

 DELETE FROM keep_profiles WHERE id IS NULL;
 DELETE FROM profiles WHERE userid NOT IN (SELECT DISTINCT id FROM keep_profiles);
+DELETE FROM bug_mentors
+
 DROP TABLE IF EXISTS keep_profiles;
 UPDATE profiles SET public_key=NULL;
 COMMIT;

@@ -387,5 +392,10 @@ DELETE FROM whine_schedules;
 DELETE FROM quips;
 COMMIT;

+
+START TRANSACTION ;
+DELETE FROM
+
+
 SET foreign_key_checks = 1;

File diff hidden because one or more lines are too long
@@ -10,39 +10,38 @@

 from datetime import datetime
 import unittest

 from bzETL import extract_bugzilla, bz_etl
 from bzETL.bz_etl import etl
 from bzETL.extract_bugzilla import get_current_time, SCREENED_WHITEBOARD_BUG_GROUPS
-from pyLibrary.cnv import CNV
+from pyLibrary import convert
 from pyLibrary.collections import MIN
-from pyLibrary.queries.db_query import esfilter2sqlwhere
-from pyLibrary.sql.db import DB, all_db
-from pyLibrary.env.logs import Log
-from pyLibrary.env.elasticsearch import ElasticSearch
+from pyLibrary.debugs import startup, constants
+from pyLibrary.debugs.logs import Log
+from pyLibrary.dot import Dict, Null, wrap
 from pyLibrary.env.files import File
-from pyLibrary.queries import Q
 from pyLibrary.maths.randoms import Random
-from pyLibrary.env import startup
-from pyLibrary import struct
-from pyLibrary.struct import Struct, Null
+from pyLibrary.queries import qb
+from pyLibrary.queries.qb_usingMySQL import esfilter2sqlwhere
+from pyLibrary.sql.mysql import MySQL, all_db
+from pyLibrary.testing import elasticsearch
 from pyLibrary.thread.threads import ThreadedQueue, Thread
 from pyLibrary.times.timer import Timer
-from util import compare_es, database
+from util import database, compare_es
 from util.compare_es import get_all_bug_versions
 from util.database import diff


 BUG_GROUP_FOR_TESTING = "super secret"


 class TestETL(unittest.TestCase):
     def setUp(self):
-        self.settings = startup.read_settings(filename="test_settings.json")
+        self.settings = startup.read_settings(filename="./tests/resources/config/test_settings.json")
+        constants.set(self.settings.constants)
         Log.start(self.settings.debug)

     def tearDown(self):
-        #CLOSE THE CACHED DB CONNECTIONS
+        #CLOSE THE CACHED MySQL CONNECTIONS
         bz_etl.close_db_connections()

         if all_db:

@@ -60,13 +59,13 @@ class TestETL(unittest.TestCase):
         # settings.param.allow_private_bugs = True
         database.make_test_instance(self.settings.bugzilla)

-        with DB(self.settings.bugzilla) as db:
+        with MySQL(self.settings.bugzilla) as db:
             candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
             reference = elasticsearch.open_test_instance("reference", self.settings.private_bugs_reference)

             #SETUP RUN PARAMETERS
-            param = Struct()
-            param.end_time = CNV.datetime2milli(get_current_time(db))
+            param = Dict()
+            param.end_time = convert.datetime2milli(get_current_time(db))
             param.start_time = 0
             param.start_time_str = extract_bugzilla.milli2string(db, 0)

@@ -74,7 +73,7 @@ class TestETL(unittest.TestCase):
             param.bug_list = self.settings.param.bugs
             param.allow_private_bugs = self.settings.param.allow_private_bugs

-            with ThreadedQueue(candidate, size=1000) as output:
+            with ThreadedQueue("etl_queue", candidate, max_size=1000) as output:
                 etl(db, output, param, please_stop=None)

             #COMPARE ALL BUGS

@@ -91,18 +90,18 @@ class TestETL(unittest.TestCase):
         NUM_TO_TEST = 100
         MAX_BUG_ID = 900000

-        with DB(self.settings.bugzilla) as db:
+        with MySQL(self.settings.bugzilla) as db:
             candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
-            reference = ElasticSearch(self.settings.private_bugs_reference)
+            reference = elasticsearch.Index(self.settings.private_bugs_reference)

             #GO FASTER BY STORING LOCAL FILE
             local_cache = File(self.settings.param.temp_dir + "/private_bugs.json")
             if local_cache.exists:
-                private_bugs = set(CNV.JSON2object(local_cache.read()))
+                private_bugs = set(convert.json2value(local_cache.read()))
             else:
                 with Timer("get private bugs"):
                     private_bugs = compare_es.get_private_bugs(reference)
-                    local_cache.write(CNV.object2JSON(private_bugs))
+                    local_cache.write(convert.value2json(private_bugs))

             while True:
                 some_bugs = [b for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)] if b not in private_bugs]

@@ -110,8 +109,8 @@ class TestETL(unittest.TestCase):
                 Log.note("Test with the following bug_ids: {{bugs}}", {"bugs":some_bugs})

                 #SETUP RUN PARAMETERS
-                param = Struct()
-                param.end_time = CNV.datetime2milli(get_current_time(db))
+                param = Dict()
+                param.end_time = convert.datetime2milli(get_current_time(db))
                 param.start_time = 0
                 param.start_time_str = extract_bugzilla.milli2string(db, 0)
                 param.alias_file = self.settings.param.alias_file

@@ -196,7 +195,7 @@ class TestETL(unittest.TestCase):
         database.make_test_instance(self.settings.bugzilla)

         #MARK SOME BUGS PRIVATE
-        with DB(self.settings.bugzilla) as db:
+        with MySQL(self.settings.bugzilla) as db:
             for b in private_bugs:
                 database.add_bug_group(db, b, BUG_GROUP_FOR_TESTING)

@@ -219,7 +218,7 @@ class TestETL(unittest.TestCase):
         bz_etl.main(self.settings, es, es_c)

         #MARK SOME STUFF PRIVATE
-        with DB(self.settings.bugzilla) as db:
+        with MySQL(self.settings.bugzilla) as db:
             #BUGS
             private_bugs = set(Random.sample(self.settings.param.bugs, 3))
             Log.note("The private bugs are {{bugs}}", {"bugs": private_bugs})

@@ -259,7 +258,7 @@ class TestETL(unittest.TestCase):

         #MARK SOME STUFF PUBLIC

-        with DB(self.settings.bugzilla) as db:
+        with MySQL(self.settings.bugzilla) as db:
             for b in private_bugs:
                 database.remove_bug_group(db, b, BUG_GROUP_FOR_TESTING)

@@ -276,7 +275,7 @@ class TestETL(unittest.TestCase):
         database.make_test_instance(self.settings.bugzilla)

         #MARK SOME STUFF PRIVATE
-        with DB(self.settings.bugzilla) as db:
+        with MySQL(self.settings.bugzilla) as db:
             private_attachments = db.query("""
                 SELECT
                     bug_id,

@@ -305,7 +304,7 @@ class TestETL(unittest.TestCase):
         database.make_test_instance(self.settings.bugzilla)

         #MARK SOME COMMENTS PRIVATE
-        with DB(self.settings.bugzilla) as db:
+        with MySQL(self.settings.bugzilla) as db:
             private_comments = db.query("""
                 SELECT
                     bug_id,

@@ -341,7 +340,7 @@ class TestETL(unittest.TestCase):
         database.make_test_instance(self.settings.bugzilla)

         #MARK SOME BUGS PRIVATE
-        with DB(self.settings.bugzilla) as db:
+        with MySQL(self.settings.bugzilla) as db:
             for b in private_bugs:
                 database.add_bug_group(db, b, BUG_GROUP_FOR_TESTING)

@@ -350,7 +349,7 @@ class TestETL(unittest.TestCase):
         bz_etl.main(self.settings, es, es_c)

         # MAKE A CHANGE TO THE PRIVATE BUGS
-        with DB(self.settings.bugzilla) as db:
+        with MySQL(self.settings.bugzilla) as db:
             for b in private_bugs:
                 old_bug = db.query("SELECT * FROM bugs WHERE bug_id={{bug_id}}", {"bug_id": b})[0]
                 new_bug = old_bug.copy()

@@ -370,15 +369,15 @@ class TestETL(unittest.TestCase):
                 "query": {"match_all": {}},
                 "filter": {"and": [
                     {"terms": {"bug_id": private_bugs}},
-                    {"range": {"expires_on": {"gte": CNV.datetime2milli(now)}}}
+                    {"range": {"expires_on": {"gte": convert.datetime2milli(now)}}}
                 ]}
             }},
             "from": 0,
             "size": 200000,
             "sort": []
         })
-        latest_bugs = Q.select(results.hits.hits, "_source")
-        latest_bugs_index = Q.unique_index(latest_bugs, "bug_id")  # IF NOT UNIQUE, THEN ETL IS WRONG
+        latest_bugs = qb.select(results.hits.hits, "_source")
+        latest_bugs_index = qb.unique_index(latest_bugs, "bug_id")  # IF NOT UNIQUE, THEN ETL IS WRONG

         for bug_id in private_bugs:
             if latest_bugs_index[bug_id] == None:

@@ -396,18 +395,18 @@ class TestETL(unittest.TestCase):
     def test_incremental_etl_catches_tracking_flags(self):
         database.make_test_instance(self.settings.bugzilla)

-        with DB(self.settings.bugzilla) as db:
+        with MySQL(self.settings.bugzilla) as db:
             es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

             #SETUP RUN PARAMETERS
-            param = Struct()
-            param.end_time = CNV.datetime2milli(get_current_time(db))
+            param = Dict()
+            param.end_time = convert.datetime2milli(get_current_time(db))
             # FLAGS ADDED TO BUG 813650 ON 18/12/2012 2:38:08 AM (PDT), SO START AT SOME LATER TIME
-            param.start_time = CNV.datetime2milli(CNV.string2datetime("02/01/2013 10:09:15", "%d/%m/%Y %H:%M:%S"))
+            param.start_time = convert.datetime2milli(convert.string2datetime("02/01/2013 10:09:15", "%d/%m/%Y %H:%M:%S"))
             param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)

             param.alias_file = self.settings.param.alias_file
-            param.bug_list = struct.wrap([813650])
+            param.bug_list = wrap([813650])
             param.allow_private_bugs = self.settings.param.allow_private_bugs

             with ThreadedQueue(es, size=1000) as output:

@@ -428,7 +427,7 @@ class TestETL(unittest.TestCase):

         database.make_test_instance(self.settings.bugzilla)

-        with DB(self.settings.bugzilla) as db:
+        with MySQL(self.settings.bugzilla) as db:
             es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

             #MARK BUG AS ONE OF THE SCREENED GROUPS

@@ -436,13 +435,13 @@ class TestETL(unittest.TestCase):
             db.flush()

             #SETUP RUN PARAMETERS
-            param = Struct()
-            param.end_time = CNV.datetime2milli(get_current_time(db))
+            param = Dict()
+            param.end_time = convert.datetime2milli(get_current_time(db))
             param.start_time = 0
             param.start_time_str = extract_bugzilla.milli2string(db, 0)

             param.alias_file = self.settings.param.alias_file
-            param.bug_list = struct.wrap([GOOD_BUG_TO_TEST])  # bug 1046 sees lots of whiteboard, and other field, changes
+            param.bug_list = wrap([GOOD_BUG_TO_TEST])  # bug 1046 sees lots of whiteboard, and other field, changes
             param.allow_private_bugs = True

             with ThreadedQueue(es, size=1000) as output:

@@ -460,7 +459,7 @@ class TestETL(unittest.TestCase):

         database.make_test_instance(self.settings.bugzilla)

-        with DB(self.settings.bugzilla) as db:
+        with MySQL(self.settings.bugzilla) as db:
             es = elasticsearch.make_test_instance("candidate", self.settings.candidate)

             #MARK BUG AS ONE OF THE SCREENED GROUPS

@@ -470,13 +469,13 @@ class TestETL(unittest.TestCase):
             db.flush()

             #SETUP RUN PARAMETERS
-            param = Struct()
-            param.end_time = CNV.datetime2milli(get_current_time(db))
+            param = Dict()
+            param.end_time = convert.datetime2milli(get_current_time(db))
             param.start_time = 0
             param.start_time_str = extract_bugzilla.milli2string(db, 0)

             param.alias_file = self.settings.param.alias_file
-            param.bug_list = struct.wrap([GOOD_BUG_TO_TEST])  # bug 1046 sees lots of whiteboard, and other field, changes
+            param.bug_list = wrap([GOOD_BUG_TO_TEST])  # bug 1046 sees lots of whiteboard, and other field, changes
             param.allow_private_bugs = True

             with ThreadedQueue(es, size=1000) as output:

@@ -491,13 +490,13 @@ class TestETL(unittest.TestCase):

     def test_incremental_has_correct_expires_on(self):
         # 813650, 726635 BOTH HAVE CHANGES IN 2013
-        bugs = struct.wrap([813650, 726635])
-        start_incremental=CNV.datetime2milli(CNV.string2datetime("2013-01-01", "%Y-%m-%d"))
+        bugs = wrap([813650, 726635])
+        start_incremental=convert.datetime2milli(convert.string2datetime("2013-01-01", "%Y-%m-%d"))

         es = elasticsearch.make_test_instance("candidate", self.settings.candidate)
-        with DB(self.settings.bugzilla) as db:
+        with MySQL(self.settings.bugzilla) as db:
             #SETUP FIRST RUN PARAMETERS
-            param = Struct()
+            param = Dict()
             param.end_time = start_incremental
             param.start_time = 0
             param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)

@@ -510,8 +509,8 @@ class TestETL(unittest.TestCase):
                 etl(db, output, param, please_stop=None)

             #SETUP INCREMENTAL RUN PARAMETERS
-            param = Struct()
-            param.end_time = CNV.datetime2milli(datetime.utcnow())
+            param = Dict()
+            param.end_time = convert.datetime2milli(datetime.utcnow())
             param.start_time = start_incremental
             param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)

@@ -528,7 +527,7 @@ class TestETL(unittest.TestCase):
             "query": {"match_all": {}},
             "filter": {"and":[
                 {"term":{"bug_id":b}},
-                {"range":{"expires_on":{"gte":CNV.datetime2milli(datetime.utcnow())}}}
+                {"range":{"expires_on":{"gte":convert.datetime2milli(datetime.utcnow())}}}
             ]}
         }},
         "from": 0,

@@ -564,13 +563,13 @@ def verify_public_bugs(es, private_bugs):

 def verify_no_private_attachments(es, private_attachments):
     #VERIFY ATTACHMENTS ARE NOT IN OUTPUT
-    for b in Q.select(private_attachments, "bug_id"):
+    for b in qb.select(private_attachments, "bug_id"):
         versions = compare_es.get_all_bug_versions(es, b)
         #WE ASSUME THE ATTACHMENT, IF IT EXISTS, WILL BE SOMEWHERE IN THE BUG IT
         #BELONGS TO, IF AT ALL
         for v in versions:
             for a in v.attachments:
-                if a.attach_id in Q.select(private_attachments, "attach_id"):
+                if a.attach_id in qb.select(private_attachments, "attach_id"):
                     Log.error("Private attachment should not exist")

@@ -587,7 +586,7 @@ def verify_no_private_comments(es, private_comments):
         "sort": []
     })

-    if Q.select(data.hits.hits, "_source"):
+    if qb.select(data.hits.hits, "_source"):
         Log.error("Expecting no comments")

@@ -601,25 +600,25 @@ def compare_both(candidate, reference, settings, some_bugs):
     found_errors = False
     for bug_id in some_bugs:
         try:
-            versions = Q.sort(
+            versions = qb.sort(
                 get_all_bug_versions(candidate, bug_id, datetime.utcnow()),
                 "modified_ts")
             # WE CAN NOT EXPECT candidate TO BE UP TO DATE BECAUSE IT IS USING AN OLD IMAGE
             if not versions:
-                max_time = CNV.milli2datetime(settings.bugzilla.expires_on)
+                max_time = convert.milli2datetime(settings.bugzilla.expires_on)
             else:
-                max_time = CNV.milli2datetime(versions.last().modified_ts)
+                max_time = convert.milli2datetime(versions.last().modified_ts)

             pre_ref_versions = get_all_bug_versions(reference, bug_id, max_time)
             ref_versions = \
-                Q.sort(
+                qb.sort(
                     #ADDED TO FIX OLD PRODUCTION BUG VERSIONS
                     [compare_es.old2new(x, settings.bugzilla.expires_on) for x in pre_ref_versions],
                     "modified_ts"
                 )

-            can = CNV.object2JSON(versions, pretty=True)
-            ref = CNV.object2JSON(ref_versions, pretty=True)
+            can = convert.value2json(versions, pretty=True)
+            ref = convert.value2json(ref_versions, pretty=True)
             if can != ref:
                 found_errors = True
                 File(try_dir + unicode(bug_id) + ".txt").write(can)

@@ -8,14 +8,14 @@
 # Author: Kyle Lahnakoski (kyle@lahnakoski.com)
 #
 import unittest
-from bzETL import extract_bugzilla, bz_etl
+from bzETL import bz_etl, extract_bugzilla
 from bzETL.bz_etl import etl
 from bzETL.extract_bugzilla import get_current_time
-from pyLibrary.cnv import CNV
-from pyLibrary.sql.db import DB, all_db
-from pyLibrary.env.logs import Log
-from pyLibrary.env import startup
-from pyLibrary.struct import Struct
+from pyLibrary import convert
+from pyLibrary.debugs import startup
+from pyLibrary.debugs.logs import Log
+from pyLibrary.dot import Dict
+from pyLibrary.sql.mysql import all_db, MySQL
+from pyLibrary.testing import elasticsearch
 from pyLibrary.thread.threads import ThreadedQueue

@@ -31,7 +31,7 @@ class TestOneETL(unittest.TestCase):


     def tearDown(self):
-        #CLOSE THE CACHED DB CONNECTIONS
+        #CLOSE THE CACHED MySQL CONNECTIONS
         bz_etl.close_db_connections()

         if all_db:

@@ -45,12 +45,12 @@ class TestOneETL(unittest.TestCase):
         USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO KEEP
         THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.
         """
-        with DB(self.settings.bugzilla) as db:
+        with MySQL(self.settings.bugzilla) as db:
             candidate = elasticsearch.make_test_instance("candidate", self.settings.elasticsearch)

             #SETUP RUN PARAMETERS
-            param = Struct()
-            param.end_time = CNV.datetime2milli(get_current_time(db))
+            param = Dict()
+            param.end_time = convert.datetime2milli(get_current_time(db))
             param.start_time = 0
             param.start_time_str = extract_bugzilla.milli2string(db, 0)

@@ -63,7 +63,7 @@ class TestOneETL(unittest.TestCase):


         #TODO: INCLUDE OPTION TO USE REAL ES (AND ENSURE REALLY WORKING)
-        # es_settings=Struct(**{
+        # es_settings=Dict(**{
         #     "host": "http://localhost",
         #     "port": "9200",
         #     "index": ElasticSearch.proto_name("test_public_bugs"),

@@ -9,10 +9,10 @@
 #

 from bzETL import replicate
-from pyLibrary.env import startup
-from pyLibrary.cnv import CNV
-from pyLibrary.env.elasticsearch import ElasticSearch
-from pyLibrary.env.logs import Log
+from pyLibrary import convert
+from pyLibrary.debugs import startup
+from pyLibrary.debugs.logs import Log
+from pyLibrary.env import elasticsearch


 def test_replication():

@@ -20,10 +20,10 @@ def test_replication():
         settings=startup.read_settings(filename="replication_settings.json")
         Log.start(settings.debug)

-        source=ElasticSearch(settings.source)
+        source=elasticsearch.Index(settings.source)
         destination=replicate.get_or_create_index(settings["destination"], source)

-        replicate.replicate(source, destination, [537285], CNV.string2datetime("19900101", "%Y%m%d"))
+        replicate.replicate(source, destination, [537285], convert.string2datetime("19900101", "%Y%m%d"))
     finally:
         Log.stop()

@@ -10,11 +10,10 @@
 from datetime import datetime

 from bzETL import transform_bugzilla, parse_bug_history
-from pyLibrary import struct
-from pyLibrary.struct import nvl
-from pyLibrary.cnv import CNV
+from pyLibrary import convert
+from pyLibrary.dot import coalesce, unwrap
 from pyLibrary.maths import Math
-from pyLibrary.queries import Q
+from pyLibrary.queries import qb


 #PULL ALL BUG DOCS FROM ONE ES

@@ -22,14 +21,14 @@ from pyLibrary.times.timer import Timer


 def get_all_bug_versions(es, bug_id, max_time=None):
-    max_time = nvl(max_time, datetime.max)
+    max_time = coalesce(max_time, datetime.max)

     data = es.search({
         "query": {"filtered": {
             "query": {"match_all": {}},
             "filter": {"and": [
                 {"term": {"bug_id": bug_id}},
-                {"range": {"modified_ts": {"lte": CNV.datetime2milli(max_time)}}}
+                {"range": {"modified_ts": {"lte": convert.datetime2milli(max_time)}}}
             ]}
         }},
         "from": 0,

@@ -37,7 +36,7 @@ def get_all_bug_versions(es, bug_id, max_time=None):
         "sort": []
     })

-    return Q.select(data.hits.hits, "_source")
+    return qb.select(data.hits.hits, "_source")


 def get_private_bugs(es):

@@ -63,10 +62,10 @@ def get_private_bugs(es):
     output = set([])
     for bug in data.hits.hits:
         output.add(bug.fields.bug_id)
-        output |= set(nvl(CNV.value2intlist(bug.fields.blocked), []))
-        output |= set(nvl(CNV.value2intlist(bug.fields.dependson), []))
-        output |= set(nvl(CNV.value2intlist(bug.fields.dupe_of), []))
-        output |= set(nvl(CNV.value2intlist(bug.fields.dupe_by), []))
+        output |= set(coalesce(convert.value2intlist(bug.fields.blocked), []))
+        output |= set(coalesce(convert.value2intlist(bug.fields.dependson), []))
+        output |= set(coalesce(convert.value2intlist(bug.fields.dupe_of), []))
+        output |= set(coalesce(convert.value2intlist(bug.fields.dupe_by), []))

     output.add(551988, 636964)
     return output

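With `coalesce` replacing `nvl`, `get_all_bug_versions()` still defaults `max_time` to `datetime.max`, so callers may omit the cap entirely. A sketch of the two common calls, assuming `es` is an open index as in `compare_both()`:

    from datetime import datetime
    from util.compare_es import get_all_bug_versions

    all_versions = get_all_bug_versions(es, 813650)                    # no cap: up to datetime.max
    recent_only = get_all_bug_versions(es, 813650, datetime.utcnow())  # cap at "now"
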
@@ -83,23 +82,23 @@ def old2new(bug, max_date):
     else:
         bug.everconfirmed = int(bug.everconfirmed)

-    bug = CNV.JSON2object(CNV.object2JSON(bug).replace("bugzilla: other b.m.o issues ", "bugzilla: other b.m.o issues"))
+    bug = convert.json2value(convert.value2json(bug).replace("bugzilla: other b.m.o issues ", "bugzilla: other b.m.o issues"))

     if bug.expires_on > max_date:
         bug.expires_on = parse_bug_history.MAX_TIME
     if bug.votes != None:
         bug.votes = int(bug.votes)
-    bug.dupe_by = CNV.value2intlist(bug.dupe_by)
+    bug.dupe_by = convert.value2intlist(bug.dupe_by)
     if bug.votes == 0:
         del bug["votes"]
     # if Math.is_integer(bug.remaining_time) and int(bug.remaining_time) == 0:
     #     bug.remaining_time = 0
     if bug.cf_due_date != None and not Math.is_number(bug.cf_due_date):
-        bug.cf_due_date = CNV.datetime2milli(
-            CNV.string2datetime(bug.cf_due_date, "%Y-%m-%d")
+        bug.cf_due_date = convert.datetime2milli(
+            convert.string2datetime(bug.cf_due_date, "%Y-%m-%d")
         )
-    bug.changes = CNV.JSON2object(
-        CNV.object2JSON(Q.sort(bug.changes, "field_name")) \
+    bug.changes = convert.json2value(
+        convert.value2json(qb.sort(bug.changes, "field_name")) \
         .replace("\"field_value_removed\":", "\"old_value\":") \
         .replace("\"field_value\":", "\"new_value\":")
     )

@@ -113,7 +112,7 @@ def old2new(bug, max_date):
             if Math.is_number(bug.cf_last_resolved):
                 bug.cf_last_resolved = long(bug.cf_last_resolved)
             else:
-                bug.cf_last_resolved = CNV.datetime2milli(CNV.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S"))
+                bug.cf_last_resolved = convert.datetime2milli(convert.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S"))
         except Exception, e:
             pass

@@ -123,15 +122,15 @@ def old2new(bug, max_date):
             if c.attach_id == '':
                 c.attach_id = None
             else:
-                c.attach_id = CNV.value2int(c.attach_id)
+                c.attach_id = convert.value2int(c.attach_id)

-    bug.attachments = Q.sort(bug.attachments, "attach_id")
+    bug.attachments = qb.sort(bug.attachments, "attach_id")
     for a in bug.attachments:
-        a.attach_id = CNV.value2int(a.attach_id)
+        a.attach_id = convert.value2int(a.attach_id)
         for k, v in list(a.items()):
             if k.endswith("isobsolete") or k.endswith("ispatch") or k.endswith("isprivate"):
-                struct.unwrap(a)[k] = CNV.value2int(v)  # PREVENT dot (.) INTERPRETATION
-                a[k.split(".")[-1].split("_")[-1]] = CNV.value2int(v)
+                unwrap(a)[k] = convert.value2int(v)  # PREVENT dot (.) INTERPRETATION
+                a[k.split(".")[-1].split("_")[-1]] = convert.value2int(v)

     bug = transform_bugzilla.normalize(bug)
     return bug

@@ -1,11 +1,11 @@
 # encoding: utf-8
 #
 from bzETL.extract_bugzilla import milli2string, get_current_time
-from pyLibrary.cnv import CNV
-from pyLibrary.queries.db_query import esfilter2sqlwhere
-from pyLibrary.sql.db import DB
-from pyLibrary.env.logs import Log
-from pyLibrary.struct import Struct
+from pyLibrary import convert
+from pyLibrary.debugs.logs import Log
+from pyLibrary.dot import Dict
+from pyLibrary.queries.qb_usingMySQL import esfilter2sqlwhere
+from pyLibrary.sql.mysql import MySQL
 from pyLibrary.times.timer import Timer


@@ -20,13 +20,13 @@ def make_test_instance(db_settings):
         Log.note("Make empty {{schema}} schema", {"schema":db_settings.schema})
         no_schema=db_settings.copy()
         no_schema.schema = None
-        with DB(no_schema) as db:
+        with MySQL(no_schema) as db:
             db.execute("DROP DATABASE IF EXISTS {{schema}}", {"schema":db.quote_column(db_settings.schema)})
             db.execute("CREATE DATABASE {{schema}}", {"schema":db.quote_column(db_settings.schema)})

         #FILL SCHEMA
         Log.note("Fill {{schema}} schema with data", {"schema":db_settings.schema})
-        DB.execute_file(db_settings, db_settings.filename)
+        MySQL.execute_file(filename=db_settings.filename, settings=db_settings)

     except Exception, e:
         Log.error("Can not setup test database", e)

@@ -63,8 +63,8 @@ def add_bug_group(db, bug_id, group_name):
         group_id=group_exists[0].id

     diff(db, "bugs",
-        Struct(bug_id=bug_id, bug_group=None),
-        Struct(bug_id=bug_id, bug_group=group_name)
+        Dict(bug_id=bug_id, bug_group=None),
+        Dict(bug_id=bug_id, bug_group=group_name)
     )
     db.insert("bug_group_map", {"bug_id":bug_id, "group_id":group_id})

@@ -73,8 +73,8 @@ def remove_bug_group(db, bug_id, group_name):
     group_id=db.query("SELECT id FROM groups WHERE name={{name}}", {"name": group_name})[0].id

     diff(db, "bugs",
-        Struct(bug_id=bug_id, bug_group=group_name),
-        Struct(bug_id=bug_id, bug_group=None)
+        Dict(bug_id=bug_id, bug_group=group_name),
+        Dict(bug_id=bug_id, bug_group=None)
     )
     db.execute("DELETE FROM bug_group_map WHERE bug_id={{bug_id}} and group_id={{group_id}}", {
         "bug_id":bug_id,

@@ -88,7 +88,7 @@ def diff(db, table, old_record, new_record):
     """
     UPDATE bugs_activity WITH THE CHANGES IN RECORDS
     """
-    now = milli2string(db, CNV.datetime2milli(get_current_time(db)))
+    now = milli2string(db, convert.datetime2milli(get_current_time(db)))
     changed = set(old_record.keys()) ^ set(new_record.keys())
     changed |= set([k for k, v in old_record.items() if v != new_record[k]])

@@ -103,7 +103,7 @@ def diff(db, table, old_record, new_record):
     if fieldid == None:
         Log.error("Expecting a valid field name")

-    activity = Struct(
+    activity = Dict(
         bug_id=old_record.bug_id,
         who=1,
         bug_when=now,

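`diff()` records the delta between an old and a new record in `bugs_activity`, which is how these tests fake a mid-air edit; with `Struct` renamed to `Dict`, a call mirrors `add_bug_group()` above:

    diff(db, "bugs",
        Dict(bug_id=1157, bug_group=None),           # record before the change
        Dict(bug_id=1157, bug_group="super secret")  # record after the change
    )
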