move config files (no more secrets!)
each bug has own db connection
Kyle Lahnakoski 2015-09-09 16:46:06 -04:00
Parent 83c16b49c5
Commit d4cb9d15c3
80 changed files with 3784 additions and 1954 deletions

.gitignore vendored
View File

@ -15,3 +15,4 @@ build
dist
/pyLibrary/.svn
/results

View File

@ -1,12 +1,26 @@
# encoding: utf-8
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from bzETL.extract_bugzilla import get_all_cc_changes
from pyLibrary.env import startup, elasticsearch
from pyLibrary.cnv import CNV
from pyLibrary.queries.es_query import ESQuery
from pyLibrary.sql.db import DB
from pyLibrary.env.logs import Log
from pyLibrary.collections.multiset import Multiset
from pyLibrary.queries import Q
from pyLibrary.struct import nvl, set_default
from pyLibrary import convert
from pyLibrary.collections import Multiset
from pyLibrary.debugs import startup
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import set_default, coalesce
from pyLibrary.env import elasticsearch
from pyLibrary.queries import qb
from pyLibrary.queries.qb_usingES import FromES
from pyLibrary.sql.mysql import MySQL
def full_analysis(settings, bug_list=None, please_stop=None):
@ -24,18 +38,18 @@ def full_analysis(settings, bug_list=None, please_stop=None):
analyzer = AliasAnalyzer(settings.alias)
if bug_list:
with DB(settings.bugzilla, readonly=True) as db:
with MySQL(settings.bugzilla, readonly=True) as db:
data = get_all_cc_changes(db, bug_list)
analyzer.aggregator(data)
analyzer.analysis(True, please_stop)
return
with DB(settings.bugzilla, readonly=True) as db:
start = nvl(settings.alias.start, 0)
end = nvl(settings.alias.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)
with MySQL(settings.bugzilla, readonly=True) as db:
start = coalesce(settings.alias.start, 0)
end = coalesce(settings.alias.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)
#Perform analysis on blocks of bugs, in case we crash partway through
for s, e in Q.intervals(start, end, settings.alias.increment):
for s, e in qb.intervals(start, end, settings.alias.increment):
Log.note("Load range {{start}}-{{end}}", {
"start": s,
"end": e
@ -56,7 +70,7 @@ class AliasAnalyzer(object):
try:
a = set_default({}, settings.elasticsearch, {"type":"alias"})
self.es = elasticsearch.Cluster(settings.elasticsearch).get_or_create_index(a, ALIAS_SCHEMA, limit_replicas=True)
self.esq = ESQuery(self.es)
self.esq = FromES(self.es)
result = self.esq.query({
"from":"bug_aliases",
"select":["canonical", "alias"]
@ -69,7 +83,7 @@ class AliasAnalyzer(object):
# LOAD THE NON-MATCHES
na = set_default({}, settings.elasticsearch, {"type":"not_alias"})
es = elasticsearch.Cluster(na).get_or_create_index(na)
esq = ESQuery(es)
esq = FromES(es)
result = esq.query({
"from":"bug_aliases",
"select":["canonical", "alias"]
@ -110,7 +124,7 @@ class AliasAnalyzer(object):
if count < 0:
problem_agg.add(self.alias(email)["canonical"], amount=count)
problems = Q.sort([
problems = qb.sort([
{"email": e, "count": c}
for e, c in problem_agg.dic.iteritems()
if not self.not_aliases.get(e, None) and (c <= -(DIFF / 2) or last_run)
@ -126,7 +140,7 @@ class AliasAnalyzer(object):
for bug_id, agg in self.bugs.iteritems():
if agg.dic.get(problem.email, 0) < 0: #ONLY BUGS THAT ARE EXPERIENCING THIS problem
solution_agg += agg
solutions = Q.sort([{"email": e, "count": c} for e, c in solution_agg.dic.iteritems()], [{"field": "count", "sort": -1}, "email"])
solutions = qb.sort([{"email": e, "count": c} for e, c in solution_agg.dic.iteritems()], [{"field": "count", "sort": -1}, "email"])
if last_run and len(solutions) == 2 and solutions[0].count == -solutions[1].count:
#exact match
@ -140,7 +154,7 @@ class AliasAnalyzer(object):
"problem": problem.email,
"score": problem.count,
"solution": best_solution.email,
"matches": CNV.object2JSON(Q.select(solutions, "count")[:10:])
"matches": convert.value2json(qb.select(solutions, "count")[:10:])
})
try_again = True
self.add_alias(problem.email, best_solution.email)

View File

@ -1,6 +1,5 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
@ -8,26 +7,27 @@
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
# REPLACES THE KETTLE FLOW CONTROL PROGRAM, AND BASH SCRIPT
from pyLibrary.maths import Math
from pyLibrary import struct, jsons
from pyLibrary.env.logs import Log
from pyLibrary.struct import Struct, nvl
from bzETL import extract_bugzilla, transform_bugzilla, alias_analysis, parse_bug_history
from bzETL.extract_bugzilla import *
from bzETL.parse_bug_history import BugHistoryParser
from pyLibrary import jsons, convert
from pyLibrary.debugs import startup, constants
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import wrap, coalesce, Dict, listwrap, set_default
from pyLibrary.env import elasticsearch
from pyLibrary.env.elasticsearch import Cluster
from pyLibrary.env.files import File
from pyLibrary.env import startup
from pyLibrary.thread.threads import Queue, Thread, AllThread, Lock, ThreadedQueue
from pyLibrary.cnv import CNV
from pyLibrary.env.elasticsearch import ElasticSearch
from pyLibrary.queries import Q
from pyLibrary.sql.db import DB
from bzETL import parse_bug_history, transform_bugzilla, extract_bugzilla, alias_analysis
from pyLibrary.maths import Math
from pyLibrary.queries import qb
from pyLibrary.sql.mysql import MySQL
from pyLibrary.thread.threads import Lock, AllThread, Thread, Queue, ThreadedQueue
from pyLibrary.times.timer import Timer
from extract_bugzilla import get_private_bugs_for_delete, get_recent_private_attachments, get_recent_private_comments, get_comments, get_comments_by_id, get_recent_private_bugs, get_current_time, get_bugs, get_dependencies, get_flags, get_new_activities, get_bug_see_also, get_attachments, get_tracking_flags, get_keywords, get_cc, get_bug_groups, get_duplicates
from parse_bug_history import BugHistoryParser
db_cache_lock = Lock()
@ -55,14 +55,14 @@ def etl_comments(db, es, param, please_stop):
# CONNECTIONS ARE EXPENSIVE, CACHE HERE
with comment_db_cache_lock:
if not comment_db_cache:
comment_db = DB(db)
comment_db = MySQL(db.settings)
comment_db_cache.append(comment_db)
with comment_db_cache_lock:
Log.note("Read comments from database")
comments = get_comments(comment_db_cache[0], param)
for g, c in Q.groupby(comments, size=500):
for g, c in qb.groupby(comments, size=500):
with Timer("Write {{num}} comments to ElasticSearch", {"num": len(c)}):
es.extend({"id": cc.comment_id, "value": cc} for cc in c)
@ -72,27 +72,35 @@ def etl(db, output_queue, param, please_stop):
PROCESS RANGE, AS SPECIFIED IN param AND PUSH
BUG VERSION RECORDS TO output_queue
"""
NUM_CONNECTIONS = 10
# CONNECTIONS ARE EXPENSIVE, CACHE HERE
# MAKING CONNECTIONS IS EXPENSIVE, CACHE HERE
with db_cache_lock:
if not db_cache:
with Timer("open connections to db"):
for f in get_stuff_from_bugzilla:
db = DB(db)
db_cache.append(db)
for i in range(NUM_CONNECTIONS):
db_cache.append(MySQL(db.settings))
db_results = Queue(max=2**30)
with db_cache_lock:
# ASYMMETRIC MULTI THREADING TO GET RECORDS FROM DB
with AllThread() as all:
for i, f in enumerate(get_stuff_from_bugzilla):
def process(target, db, param, please_stop):
db_results.extend(target(db, param))
db_results = Queue(name="db results", max=2**30)
all.add(process, f, db_cache[i], param.copy())
def get_records_from_bugzilla(db, param, please_stop):
for get_stuff in get_stuff_from_bugzilla:
if please_stop:
break
db_results.extend(get_stuff(db, param))
with AllThread() as all:
with db_cache_lock:
# SPLIT TASK EVENLY, HAVE EACH BUG USE SAME CONNECTION FOR ALL DATA
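# E.G. 1000 BUGS OVER THE 10 CACHED CONNECTIONS -> 10 GROUPS OF ~100 BUGS; GROUP g ALWAYS USES db_cache[g],
# SO EACH BUG IS PULLED THROUGH ONE CONNECTION FOR ALL ITS EXTRACT QUERIES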
size = Math.ceiling(float(len(param.bug_list))/float(10))
for g, bug_ids in qb.groupby(param.bug_list, size=size):
all.add(get_records_from_bugzilla, db_cache[g], set_default(
{"bug_list": bug_ids},
param
))
db_results.add(Thread.STOP)
sorted = Q.sort(db_results, [
sorted = qb.sort(db_results, [
"bug_id",
"_merge_order",
{"field": "modified_ts", "sort": -1},
@ -102,7 +110,7 @@ def etl(db, output_queue, param, please_stop):
process = BugHistoryParser(param, output_queue)
for s in sorted:
process.processRow(s)
process.processRow(struct.wrap({"bug_id": parse_bug_history.STOP_BUG, "_merge_order": 1}))
process.processRow(wrap({"bug_id": parse_bug_history.STOP_BUG, "_merge_order": 1}))
def run_both_etl(db, output_queue, es_comments, param):
@ -128,8 +136,8 @@ def setup_es(settings, db, es, es_comments):
# INCREMENTAL UPDATE; DO NOT MAKE NEW INDEX
last_run_time = long(File(settings.param.last_run_time).read())
if not es:
es = ElasticSearch(settings.es)
es_comments = ElasticSearch(settings.es_comments)
es = elasticsearch.Index(settings.es)
es_comments = elasticsearch.Index(settings.es_comments)
elif File(settings.param.first_run_time).exists:
# DO NOT MAKE NEW INDEX, CONTINUE INITIAL FILL
try:
@ -137,17 +145,17 @@ def setup_es(settings, db, es, es_comments):
current_run_time = long(File(settings.param.first_run_time).read())
if not es:
if not settings.es.alias:
temp = ElasticSearch(settings.es).get_proto(settings.es.index)
temp = Cluster(settings.es).get_proto(settings.es.index)
settings.es.alias = settings.es.index
settings.es.index = temp.last()
es = ElasticSearch(settings.es)
es = elasticsearch.Index(settings.es)
es.set_refresh_interval(1) #REQUIRED SO WE CAN SEE WHAT BUGS HAVE BEEN LOADED ALREADY
if not settings.es_comments.alias:
temp = ElasticSearch(settings.es_comments).get_proto(settings.es_comments.index)
temp = Cluster(settings.es_comments).get_proto(settings.es_comments.index)
settings.es_comments.alias = settings.es_comments.index
settings.es_comments.index = temp.last()
es_comments = ElasticSearch(settings.es_comments)
es_comments = elasticsearch.Index(settings.es_comments)
except Exception, e:
Log.warning("can not resume ETL, restarting", e)
File(settings.param.first_run_time).delete()
@ -160,23 +168,23 @@ def setup_es(settings, db, es, es_comments):
schema = File(settings.es.schema_file).read()
if transform_bugzilla.USE_ATTACHMENTS_DOT:
schema = schema.replace("attachments_", "attachments\\.")
schema=CNV.JSON2object(schema, paths=True)
schema=convert.json2value(schema, paths=True)
schema.settings=jsons.expand_dot(schema.settings)
if not settings.es.alias:
settings.es.alias = settings.es.index
settings.es.index = ElasticSearch.proto_name(settings.es.alias)
es = ElasticSearch.create_index(settings.es, schema, limit_replicas=True)
settings.es.index = Cluster.proto_name(settings.es.alias)
es = Cluster.create_index(settings.es, schema, limit_replicas=True)
# BUG COMMENTS
comment_schema = File(settings.es_comments.schema_file).read()
comment_schema=CNV.JSON2object(comment_schema, paths=True)
comment_schema=convert.json2value(comment_schema, paths=True)
comment_schema.settings=jsons.expand_dot(comment_schema.settings)
if not settings.es_comments.alias:
settings.es_comments.alias = settings.es_comments.index
settings.es_comments.index = ElasticSearch.proto_name(settings.es_comments.alias)
es_comments = ElasticSearch.create_index(settings.es_comments, comment_schema, limit_replicas=True)
settings.es_comments.index = Cluster.proto_name(settings.es_comments.alias)
es_comments = Cluster.create_index(settings.es_comments, comment_schema, limit_replicas=True)
File(settings.param.first_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
File(settings.param.first_run_time).write(unicode(convert.datetime2milli(current_run_time)))
return current_run_time, es, es_comments, last_run_time
@ -190,7 +198,7 @@ def incremental_etl(settings, param, db, es, es_comments, output_queue):
#REMOVE PRIVATE BUGS
private_bugs = get_private_bugs_for_delete(db, param)
Log.note("Ensure the following private bugs are deleted:\n{{private_bugs|indent}}", {"private_bugs": sorted(private_bugs)})
for g, delete_bugs in Q.groupby(private_bugs, size=1000):
for g, delete_bugs in qb.groupby(private_bugs, size=1000):
still_existing = get_bug_ids(es, {"terms": {"bug_id": delete_bugs}})
if still_existing:
Log.note("Ensure the following existing private bugs are deleted:\n{{private_bugs|indent}}", {"private_bugs": sorted(still_existing)})
@ -212,7 +220,7 @@ def incremental_etl(settings, param, db, es, es_comments, output_queue):
#REMOVE **RECENT** PRIVATE ATTACHMENTS
private_attachments = get_recent_private_attachments(db, param)
bugs_to_refresh = set(Q.select(private_attachments, "bug_id"))
bugs_to_refresh = set(qb.select(private_attachments, "bug_id"))
es.delete_record({"terms": {"bug_id": bugs_to_refresh}})
#REBUILD BUGS THAT GOT REMOVED
@ -234,7 +242,7 @@ def incremental_etl(settings, param, db, es, es_comments, output_queue):
#REFRESH COMMENTS WITH PRIVACY CHANGE
private_comments = get_recent_private_comments(db, param)
comment_list = set(Q.select(private_comments, "comment_id")) | {0}
comment_list = set(qb.select(private_comments, "comment_id")) | {0}
es_comments.delete_record({"terms": {"comment_id": comment_list}})
changed_comments = get_comments_by_id(db, comment_list, param)
es_comments.extend({"id": c.comment_id, "value": c} for c in changed_comments)
@ -242,7 +250,7 @@ def incremental_etl(settings, param, db, es, es_comments, output_queue):
#GET LIST OF CHANGED BUGS
with Timer("time to get changed bug list"):
if param.allow_private_bugs:
bug_list = Q.select(db.query("""
bug_list = qb.select(db.query("""
SELECT
b.bug_id
FROM
@ -253,7 +261,7 @@ def incremental_etl(settings, param, db, es, es_comments, output_queue):
"start_time_str": param.start_time_str
}), u"bug_id")
else:
bug_list = Q.select(db.query("""
bug_list = qb.select(db.query("""
SELECT
b.bug_id
FROM
@ -286,10 +294,10 @@ def incremental_etl(settings, param, db, es, es_comments, output_queue):
def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_queue):
with Thread.run("alias_analysis", alias_analysis.full_analysis, settings=settings):
end = nvl(settings.param.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)
start = nvl(settings.param.start, 0)
end = coalesce(settings.param.end, db.query("SELECT max(bug_id)+1 bug_id FROM bugs")[0].bug_id)
start = coalesce(settings.param.start, 0)
if resume_from_last_run:
start = nvl(settings.param.start, Math.floor(get_max_bug_id(es), settings.param.increment))
start = coalesce(settings.param.start, Math.floor(get_max_bug_id(es), settings.param.increment))
#############################################################
## MAIN ETL LOOP
@ -297,7 +305,7 @@ def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_
#TWO WORKERS IS MORE THAN ENOUGH FOR A SINGLE THREAD
# with Multithread([run_both_etl, run_both_etl]) as workers:
for min, max in Q.intervals(start, end, settings.param.increment):
for min, max in qb.intervals(start, end, settings.param.increment):
if settings.args.quick and min < end - settings.param.increment and min != 0:
#--quick ONLY DOES FIRST AND LAST BLOCKS
continue
@ -306,7 +314,7 @@ def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_
#GET LIST OF CHANGED BUGS
with Timer("time to get {{min}}..{{max}} bug list", {"min":min, "max":max}):
if param.allow_private_bugs:
bug_list = Q.select(db.query("""
bug_list = qb.select(db.query("""
SELECT
b.bug_id
FROM
@ -320,7 +328,7 @@ def full_etl(resume_from_last_run, settings, param, db, es, es_comments, output_
"start_time_str": param.start_time_str
}), u"bug_id")
else:
bug_list = Q.select(db.query("""
bug_list = qb.select(db.query("""
SELECT
b.bug_id
FROM
@ -363,17 +371,17 @@ def main(settings, es=None, es_comments=None):
#MAKE HANDLES TO CONTAINERS
try:
with DB(settings.bugzilla, readonly=True) as db:
with MySQL(settings.bugzilla, readonly=True) as db:
current_run_time, es, es_comments, last_run_time = setup_es(settings, db, es, es_comments)
with ThreadedQueue(es, size=500, silent=True) as output_queue:
with ThreadedQueue(es, max_size=500, silent=True) as output_queue:
#SETUP RUN PARAMETERS
param = Struct()
param.end_time = CNV.datetime2milli(get_current_time(db))
# DB WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delats_ts))
param = Dict()
param.end_time = convert.datetime2milli(get_current_time(db))
# MySQL WRITES ARE DELAYED, RESULTING IN UNORDERED bug_when IN bugs_activity (AS IS ASSUMED FOR bugs(delta_ts))
# THIS JITTER IS USUALLY NO MORE THAN ONE SECOND, BUT WE WILL GO BACK 60sec, JUST IN CASE.
# THERE ARE OCCASIONAL WRITES THAT ARE IN GMT, BUT SINCE THEY LOOK LIKE THE FUTURE, WE CAPTURE THEM
param.start_time = last_run_time - nvl(settings.param.look_back, 5 * 60 * 1000) # 5 MINUTE LOOK_BACK
param.start_time = last_run_time - coalesce(settings.param.look_back, 5 * 60 * 1000) # 5 MINUTE LOOK_BACK
param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
param.alias_file = settings.param.alias_file
param.allow_private_bugs = settings.param.allow_private_bugs
@ -395,7 +403,7 @@ def main(settings, es=None, es_comments=None):
es.delete_all_but(settings.es_comments.alias, settings.es_comments.index)
es_comments.add_alias(settings.es_comments.alias)
File(settings.param.last_run_time).write(unicode(CNV.datetime2milli(current_run_time)))
File(settings.param.last_run_time).write(unicode(convert.datetime2milli(current_run_time)))
except Exception, e:
Log.error("Problem with main ETL loop", e)
finally:
@ -454,11 +462,13 @@ def get_max_bug_id(es):
def close_db_connections():
(globals()["db_cache"], temp) = ([], db_cache)
global db_cache, comment_db_cache
db_cache, temp = [], db_cache
for db in temp:
db.close()
(globals()["comment_db_cache"], temp) = ([], comment_db_cache)
comment_db_cache, temp = [], comment_db_cache
for db in temp:
db.close()
@ -476,10 +486,11 @@ def start():
"action": "store_true",
"dest": "restart"
}])
constants.set(settings.constants)
with startup.SingleInstance(flavor_id=settings.args.filename):
if settings.args.restart:
for l in struct.listwrap(settings.debug.log):
for l in listwrap(settings.debug.log):
if l.filename:
File(l.filename).parent.delete()
File(settings.param.first_run_time).delete()

View File

@ -1,26 +1,26 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# PYTHON VERSION OF https://github.com/mozilla-metrics/bugzilla_etl/blob/master/transformations/bugzilla_to_json.ktr
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from bzETL.parse_bug_history import MAX_TIME
from pyLibrary.cnv import CNV
from pyLibrary.queries.db_query import esfilter2sqlwhere
from pyLibrary.sql.db import SQL
from pyLibrary.env.logs import Log
from pyLibrary.queries import Q
from pyLibrary.struct import Struct
#ALL BUGS IN PRIVATE ETL HAVE SCREENED FIELDS
from pyLibrary import convert
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import Dict
from pyLibrary.queries import qb
from pyLibrary.queries.qb_usingMySQL import esfilter2sqlwhere
from pyLibrary.sql import SQL
from pyLibrary.times.timer import Timer
#ALL BUGS IN PRIVATE ETL HAVE SCREENED FIELDS
SCREENED_FIELDDEFS = [
19, #bug_file_loc
24, #short_desc
@ -67,7 +67,7 @@ def get_current_time(db):
SELECT
UNIX_TIMESTAMP(now()) `value`
""")[0].value
return CNV.unix2datetime(output)
return convert.unix2datetime(output)
def milli2string(db, value):
@ -90,7 +90,7 @@ def get_screened_whiteboard(db):
groups = db.query("SELECT id FROM groups WHERE {{where}}", {
"where": esfilter2sqlwhere(db, {"terms": {"name": SCREENED_WHITEBOARD_BUG_GROUPS}})
})
globals()["SCREENED_BUG_GROUP_IDS"] = Q.select(groups, "id")
globals()["SCREENED_BUG_GROUP_IDS"] = qb.select(groups, "id")
def get_bugs_table_columns(db, schema_name):
@ -226,7 +226,7 @@ def get_bugs(db, param):
else:
return db.quote_column(col.column_name)
param.bugs_columns = Q.select(bugs_columns, "column_name")
param.bugs_columns = qb.select(bugs_columns, "column_name")
param.bugs_columns_SQL = SQL(",\n".join([lower(c) for c in bugs_columns]))
param.bug_filter = esfilter2sqlwhere(db, {"terms": {"b.bug_id": param.bug_list}})
param.screened_whiteboard = esfilter2sqlwhere(db, {"and": [
@ -290,7 +290,7 @@ def get_bugs(db, param):
def flatten_bugs_record(r, output):
for field_name, value in r.items():
if value != "---":
newRow = Struct()
newRow = Dict()
newRow.bug_id = r.bug_id
newRow.modified_ts = r.modified_ts
newRow.modified_by = r.modified_by
@ -523,7 +523,7 @@ def flatten_attachments(data):
for k,v in r.items():
if k=="bug_id":
continue
output.append(Struct(
output.append(Dict(
bug_id=r.bug_id,
modified_ts=r.modified_ts,
modified_by=r.modified_by,

View File

@ -1,11 +1,11 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
# Workflow:
# Create the current state object
@ -37,20 +37,20 @@
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
import re
import math
from pyLibrary import struct, strings
from pyLibrary import convert, strings
from pyLibrary.collections import MIN
from pyLibrary.strings import apply_diff
from pyLibrary.struct import nvl, StructList, unwrap, wrap
from pyLibrary.cnv import CNV
from pyLibrary.env.logs import Log
from pyLibrary.queries import Q
from pyLibrary.struct import Struct, Null
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import Null, wrap, DictList, Dict, coalesce, unwrap, inverse
from pyLibrary.env.files import File
from transform_bugzilla import normalize, NUMERIC_FIELDS, MULTI_FIELDS, DIFF_FIELDS
from pyLibrary.queries import qb
from pyLibrary.strings import apply_diff
from bzETL.transform_bugzilla import normalize, NUMERIC_FIELDS, MULTI_FIELDS, DIFF_FIELDS
# Used to split a flag into (type, status [,requestee])
@ -76,7 +76,7 @@ MAX_TIME = 9999999999000
class BugHistoryParser():
def __init__(self, settings, output_queue):
self.aliases = Null
self.startNewBug(struct.wrap({"bug_id": 0, "modified_ts": 0, "_merge_order": 1}))
self.startNewBug(wrap({"bug_id": 0, "modified_ts": 0, "_merge_order": 1}))
self.prevActivityID = Null
self.prev_row = Null
self.settings = settings
@ -107,8 +107,8 @@ class BugHistoryParser():
# Bugzilla bug workaround - some values were truncated, introducing uncertainty / errors:
# https://bugzilla.mozilla.org/show_bug.cgi?id=55161
if row_in.field_name in TRUNC_FIELDS:
added = CNV.value2string(row_in.new_value)
removed = CNV.value2string(row_in.old_value)
added = convert.value2string(row_in.new_value)
removed = convert.value2string(row_in.old_value)
uncertain = False
if added in ["? ?", "?"]: # Unknown value extracted from a possibly truncated field
@ -131,7 +131,7 @@ class BugHistoryParser():
# Process the "uncertain" flag as an activity
# WE ARE GOING BACKWARDS IN TIME, SO MARKUP PAST
Log.note("[Bug {{bug_id}}]: PROBLEM Setting this bug to be uncertain.", {"bug_id": self.currBugID})
self.processBugsActivitiesTableItem(struct.wrap({
self.processBugsActivitiesTableItem(wrap({
"modified_ts": row_in.modified_ts,
"modified_by": row_in.modified_by,
"field_name": "uncertain",
@ -144,7 +144,7 @@ class BugHistoryParser():
return
# Treat timestamps as int values
new_value = CNV.value2int(row_in.new_value) if row_in.field_name.endswith("_ts") else row_in.new_value
new_value = convert.value2int(row_in.new_value) if row_in.field_name.endswith("_ts") else row_in.new_value
# Determine where we are in the bug processing workflow
@ -181,11 +181,11 @@ class BugHistoryParser():
def startNewBug(self, row_in):
self.prevBugID = row_in.bug_id
self.bugVersions = StructList()
self.bugVersionsMap = Struct()
self.currActivity = Struct()
self.currBugAttachmentsMap = Struct()
self.currBugState = Struct(
self.bugVersions = DictList()
self.bugVersionsMap = Dict()
self.currActivity = Dict()
self.currBugAttachmentsMap = Dict()
self.currBugState = Dict(
_id=BugHistoryParser.uid(row_in.bug_id, row_in.modified_ts),
bug_id=row_in.bug_id,
modified_ts=row_in.modified_ts,
@ -199,7 +199,7 @@ class BugHistoryParser():
#WE FORCE ADD ALL SETS, AND WE WILL scrub() THEM OUT LATER IF NOT USED
for f in MULTI_FIELDS:
self.currBugState[f] = set([])
self.currBugState.flags = StructList() #FLAGS ARE MULTI_FIELDS, BUT ARE ALSO STRUCTS, SO MUST BE IN AN ARRAY
self.currBugState.flags = DictList() #FLAGS ARE MULTI_FIELDS, BUT ARE ALSO STRUCTS, SO MUST BE IN AN ARRAY
if row_in._merge_order != 1:
# Problem: No entry found in the 'bugs' table.
@ -229,7 +229,7 @@ class BugHistoryParser():
if currActivityID != self.prevActivityID:
self.prevActivityID = currActivityID
self.currActivity = Struct(
self.currActivity = Dict(
_id=currActivityID,
modified_ts=row_in.modified_ts,
modified_by=row_in.modified_by,
@ -251,7 +251,7 @@ class BugHistoryParser():
"modified_ts": row_in.modified_ts,
"created_ts": row_in.created_ts,
"modified_by": row_in.modified_by,
"flags": StructList()
"flags": DictList()
}
self.currBugAttachmentsMap[unicode(row_in.attach_id)] = att
@ -292,7 +292,7 @@ class BugHistoryParser():
if currActivityID != self.prevActivityID:
self.currActivity = self.bugVersionsMap[currActivityID]
if self.currActivity == None:
self.currActivity = Struct(
self.currActivity = Dict(
_id=currActivityID,
modified_ts=row_in.modified_ts,
modified_by=row_in.modified_by,
@ -377,7 +377,7 @@ class BugHistoryParser():
def populateIntermediateVersionObjects(self):
# Make sure the self.bugVersions are in descending order by modification time.
# They could be mixed because of attachment activity
self.bugVersions = Q.sort(self.bugVersions, [
self.bugVersions = qb.sort(self.bugVersions, [
{"field": "modified_ts", "sort": -1}
])
@ -385,7 +385,7 @@ class BugHistoryParser():
prevValues = {}
currVersion = Null
# Prime the while loop with an empty next version so our first iteration outputs the initial bug state
nextVersion = Struct(_id=self.currBugState._id, changes=[])
nextVersion = Dict(_id=self.currBugState._id, changes=[])
flagMap = {}
# A monotonically increasing version number (useful for debugging)
@ -431,7 +431,7 @@ class BugHistoryParser():
mergeBugVersion = True
# Link this version to the next one (if there is a next one)
self.currBugState.expires_on = nvl(nextVersion.modified_ts, MAX_TIME)
self.currBugState.expires_on = coalesce(nextVersion.modified_ts, MAX_TIME)
# Copy all attributes from the current version into self.currBugState
for propName, propValue in currVersion.items():
@ -439,7 +439,7 @@ class BugHistoryParser():
# Now walk self.currBugState forward in time by applying the changes from currVersion
#BE SURE TO APPLY REMOVES BEFORE ADDS, JUST IN CASE BOTH HAPPENED TO ONE FIELD
changes = Q.sort(currVersion.changes, ["attach_id", "field_name", {"field": "old_value", "sort": -1}, "new_value"])
changes = qb.sort(currVersion.changes, ["attach_id", "field_name", {"field": "old_value", "sort": -1}, "new_value"])
currVersion.changes = changes
self.currBugState.changes = changes
@ -461,7 +461,7 @@ class BugHistoryParser():
continue
if DEBUG_CHANGES:
Log.note("Processing change: " + CNV.object2JSON(change))
Log.note("Processing change: " + convert.value2json(change))
target = self.currBugState
targetName = "currBugState"
attach_id = change.attach_id
@ -562,7 +562,7 @@ class BugHistoryParser():
def processFlagChange(self, target, change, modified_ts, modified_by):
if target.flags == None:
Log.note("[Bug {{bug_id}}]: PROBLEM processFlagChange called with unset 'flags'", {"bug_id": self.currBugState.bug_id})
target.flags = StructList()
target.flags = DictList()
addedFlags = BugHistoryParser.getMultiFieldValue("flags", change.new_value)
removedFlags = BugHistoryParser.getMultiFieldValue("flags", change.old_value)
@ -685,7 +685,7 @@ class BugHistoryParser():
if chosen_one != None:
for f in ["value", "request_status", "requestee"]:
chosen_one[f] = nvl(added_flag[f], chosen_one[f])
chosen_one[f] = coalesce(added_flag[f], chosen_one[f])
# We need to avoid later adding this flag twice, since we rolled an add into a delete.
@ -723,7 +723,7 @@ class BugHistoryParser():
# if flag==u'review?(bjacob@mozilla.co':
# Log.debug()
flagParts = Struct(
flagParts = Dict(
modified_ts=modified_ts,
modified_by=modified_by,
value=flag
@ -742,7 +742,7 @@ class BugHistoryParser():
def addValues(self, total, add, valueType, field_name, target):
if not add:
return total
# Log.note("[Bug {{bug_id}}]: Adding " + valueType + " " + fieldName + " values:" + CNV.object2JSON(someValues))
# Log.note("[Bug {{bug_id}}]: Adding " + valueType + " " + fieldName + " values:" + convert.value2json(someValues))
if field_name == "flags":
Log.error("use processFlags")
else:
@ -763,7 +763,7 @@ class BugHistoryParser():
self.currActivity.changes.append({
"field_name": field_name,
"new_value": Null,
"old_value": ", ".join(map(unicode, Q.sort(diff))),
"old_value": ", ".join(map(unicode, qb.sort(diff))),
"attach_id": target.attach_id
})
@ -780,7 +780,7 @@ class BugHistoryParser():
if valueType == "added" and remove:
self.currActivity.changes.append({
"field_name": field_name,
"new_value": u", ".join(map(unicode, Q.sort(remove))),
"new_value": u", ".join(map(unicode, qb.sort(remove))),
"old_value": Null,
"attach_id": target.attach_id
})
@ -800,8 +800,8 @@ class BugHistoryParser():
return output
elif field_name == "cc":
# MAP CANONICAL TO EXISTING (BETWEEN map_* AND self.aliases WE HAVE A BIJECTION)
map_total = struct.inverse({t: self.alias(t) for t in total})
map_remove = struct.inverse({r: self.alias(r) for r in remove})
map_total = inverse({t: self.alias(t) for t in total})
map_remove = inverse({r: self.alias(r) for r in remove})
# CANONICAL VALUES
c_total = set(map_total.keys())
c_remove = set(map_remove.keys())
@ -816,8 +816,8 @@ class BugHistoryParser():
"type": valueType,
"object": arrayDesc,
"field_name": field_name,
"missing": Q.sort(Q.map2set(diff, map_remove)),
"existing": Q.sort(total),
"missing": qb.sort(qb.map2set(diff, map_remove)),
"existing": qb.sort(total),
"candidates": {d: self.aliases.get(d, None) for d in diff},
"bug_id": self.currBugID
})
@ -879,18 +879,18 @@ class BugHistoryParser():
"diff": diff,
"output": output
})
final_removed = Q.map2set(removed, map_total)
final_removed = qb.map2set(removed, map_total)
if final_removed:
self.currActivity.changes.append({
"field_name": field_name,
"new_value": u", ".join(map(unicode, Q.sort(final_removed))),
"new_value": u", ".join(map(unicode, qb.sort(final_removed))),
"old_value": Null,
"attach_id": target.attach_id
})
except Exception, email:
Log.error("issues", email)
return Q.map2set(output, map_total)
return qb.map2set(output, map_total)
else:
removed = total & remove
diff = remove - total
@ -899,7 +899,7 @@ class BugHistoryParser():
if valueType == "added" and removed:
self.currActivity.changes.append({
"field_name": field_name,
"new_value": u", ".join(map(unicode, Q.sort(removed))),
"new_value": u", ".join(map(unicode, qb.sort(removed))),
"old_value": Null,
"attach_id": target.attach_id
})
@ -917,13 +917,13 @@ class BugHistoryParser():
return output
def processFlags(self, total, old_values, new_values, modified_ts, modified_by, target_type, target):
added_values = StructList() #FOR SOME REASON, REMOVAL BY OBJECT DOES NOT WORK, SO WE USE THIS LIST OF STRING VALUES
added_values = DictList() #FOR SOME REASON, REMOVAL BY OBJECT DOES NOT WORK, SO WE USE THIS LIST OF STRING VALUES
for v in new_values:
flag = BugHistoryParser.makeFlag(v, modified_ts, modified_by)
if flag.request_type == None:
Log.note("[Bug {{bug_id}}]: PROBLEM Unable to parse flag {{flag}} (caused by 255 char limit?)", {
"flag": CNV.value2quote(flag.value),
"flag": convert.value2quote(flag.value),
"bug_id": self.currBugID
})
continue
@ -940,7 +940,7 @@ class BugHistoryParser():
else:
Log.note("[Bug {{bug_id}}]: PROBLEM Unable to find {{type}} FLAG: {{object}}.{{field_name}}: (All {{missing}}" + " not in : {{existing}})", {
"type": target_type,
"object": nvl(target.attach_id, target.bug_id),
"object": coalesce(target.attach_id, target.bug_id),
"field_name": "flags",
"missing": v,
"existing": total,
@ -951,21 +951,21 @@ class BugHistoryParser():
if added_values:
self.currActivity.changes.append({
"field_name": "flags",
"new_value": ", ".join(Q.sort(added_values.value)),
"new_value": ", ".join(qb.sort(added_values.value)),
"old_value": Null,
"attach_id": target.attach_id
})
if not old_values:
return total
# Log.note("[Bug {{bug_id}}]: Adding " + valueType + " " + fieldName + " values:" + CNV.object2JSON(someValues))
# Log.note("[Bug {{bug_id}}]: Adding " + valueType + " " + fieldName + " values:" + convert.value2json(someValues))
for v in old_values:
total.append(BugHistoryParser.makeFlag(v, target.modified_ts, target.modified_by))
self.currActivity.changes.append({
"field_name": "flags",
"new_value": Null,
"old_value": ", ".join(Q.sort(old_values)),
"old_value": ", ".join(qb.sort(old_values)),
"attach_id": target.attach_id
})
@ -991,7 +991,7 @@ class BugHistoryParser():
def alias(self, name):
if name == None:
return Null
return nvl(self.aliases.get(name, Null).canonical, name)
return coalesce(self.aliases.get(name, Null).canonical, name)
def initializeAliases(self):
@ -1000,7 +1000,7 @@ class BugHistoryParser():
alias_json = File(self.settings.alias_file).read()
except Exception, e:
alias_json = "{}"
self.aliases = {k: struct.wrap(v) for k, v in CNV.JSON2object(alias_json).items()}
self.aliases = {k: wrap(v) for k, v in convert.json2value(alias_json).items()}
Log.note("{{num}} aliases loaded", {"num": len(self.aliases.keys())})

View File

@ -1,6 +1,5 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
@ -8,29 +7,32 @@
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from datetime import datetime, timedelta
from pyLibrary.collections import MIN
from pyLibrary.struct import nvl, Struct
from pyLibrary.thread.threads import ThreadedQueue
from pyLibrary.times.timer import Timer
import transform_bugzilla
from pyLibrary.cnv import CNV
from pyLibrary.env.logs import Log
from pyLibrary.queries import Q
from pyLibrary.env import startup
from pyLibrary.env.files import File
from pyLibrary.collections.multiset import Multiset
from pyLibrary.env.elasticsearch import ElasticSearch
#
# REPLICATION
#
# Replication has a few benefits:
# 1) The slave can have scripting enabled, allowing a more powerful set of queries
# 2) Physical proximity increases the probability of reduced latency
# 2) Physical proximity reduces latency
# 3) The slave can be configured with better hardware
# 4) The slave's exclusivity increases availability (Mozilla's public cluster my have time of high load)
# 4) The slave's exclusivity increases availability (Mozilla's public cluster may have high load)
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from datetime import datetime, timedelta
from bzETL import transform_bugzilla
from pyLibrary import convert
from pyLibrary.collections import MIN, Multiset
from pyLibrary.debugs import startup
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import coalesce, Dict
from pyLibrary.env import elasticsearch
from pyLibrary.env.elasticsearch import Cluster
from pyLibrary.env.files import File
from pyLibrary.queries import qb
from pyLibrary.thread.threads import ThreadedQueue
from pyLibrary.times.timer import Timer
far_back = datetime.utcnow() - timedelta(weeks=52)
@ -39,12 +41,12 @@ BATCH_SIZE = 1000
def extract_from_file(source_settings, destination):
file = File(source_settings.filename)
for g, d in Q.groupby(file, size=BATCH_SIZE):
for g, d in qb.groupby(file, size=BATCH_SIZE):
try:
d2 = map(
lambda (x): {"id": x.id, "value": x},
map(
lambda(x): transform_bugzilla.normalize(CNV.JSON2object(x)),
lambda(x): transform_bugzilla.normalize(convert.json2value(x)),
d
)
)
@ -61,8 +63,8 @@ def extract_from_file(source_settings, destination):
def get_last_updated(es):
if not isinstance(es, ElasticSearch):
return CNV.milli2datetime(0)
if not isinstance(es, elasticsearch.Index):
return convert.milli2datetime(0)
try:
results = es.search({
@ -70,7 +72,7 @@ def get_last_updated(es):
"query": {"match_all": {}},
"filter": {
"range": {
"modified_ts": {"gte": CNV.datetime2milli(far_back)}}}
"modified_ts": {"gte": convert.datetime2milli(far_back)}}}
}},
"from": 0,
"size": 0,
@ -79,8 +81,8 @@ def get_last_updated(es):
})
if results.facets.modified_ts.count == 0:
return CNV.milli2datetime(0)
return CNV.milli2datetime(results.facets.modified_ts.max)
return convert.milli2datetime(0)
return convert.milli2datetime(results.facets.modified_ts.max)
except Exception, e:
Log.error("Can not get_last_updated from {{host}}/{{index}}",{
"host": es.settings.host,
@ -102,13 +104,13 @@ def get_pending(es, since):
pending_bugs = None
for s, e in Q.intervals(0, max_bug+1, 100000):
for s, e in qb.intervals(0, max_bug+1, 100000):
Log.note("Collect history for bugs from {{start}}..{{end}}", {"start":s, "end":e})
result = es.search({
"query": {"filtered": {
"query": {"match_all": {}},
"filter": {"and":[
{"range": {"modified_ts": {"gte": CNV.datetime2milli(since)}}},
{"range": {"modified_ts": {"gte": convert.datetime2milli(since)}}},
{"range": {"bug_id": {"gte": s, "lte": e}}}
]}
}},
@ -140,30 +142,30 @@ def get_pending(es, since):
# USE THE source TO GET THE INDEX SCHEMA
def get_or_create_index(destination_settings, source):
#CHECK IF INDEX, OR ALIAS, EXISTS
es = ElasticSearch(destination_settings)
es = elasticsearch.Index(destination_settings)
aliases = es.get_aliases()
indexes = [a for a in aliases if a.alias == destination_settings.index or a.index == destination_settings.index]
if not indexes:
#CREATE INDEX
schema = CNV.JSON2object(File(destination_settings.schema_file).read(), paths=True)
schema = convert.json2value(File(destination_settings.schema_file).read(), paths=True)
assert schema.settings
assert schema.mappings
ElasticSearch.create_index(destination_settings, schema, limit_replicas=True)
Cluster(destination_settings).create_index(destination_settings, schema, limit_replicas=True)
elif len(indexes) > 1:
Log.error("do not know how to replicate to more than one index")
elif indexes[0].alias != None:
destination_settings.alias = indexes[0].alias
destination_settings.index = indexes[0].index
return ElasticSearch(destination_settings)
return elasticsearch.Index(destination_settings)
def replicate(source, destination, pending, last_updated):
"""
COPY source RECORDS TO destination
"""
for g, bugs in Q.groupby(pending, max_size=BATCH_SIZE):
for g, bugs in qb.groupby(pending, max_size=BATCH_SIZE):
with Timer("Replicate {{num_bugs}} bug versions", {"num_bugs": len(bugs)}):
data = source.search({
"query": {"filtered": {
@ -171,7 +173,7 @@ def replicate(source, destination, pending, last_updated):
"filter": {"and": [
{"terms": {"bug_id": set(bugs)}},
{"range": {"expires_on":
{"gte": CNV.datetime2milli(last_updated)}
{"gte": convert.datetime2milli(last_updated)}
}}
]}
}},
@ -197,12 +199,12 @@ def main(settings):
#USE A SOURCE FILE
if settings.source.filename != None:
settings.destination.alias = settings.destination.index
settings.destination.index = ElasticSearch.proto_name(settings.destination.alias)
schema = CNV.JSON2object(File(settings.destination.schema_file).read(), paths=True, flexible=True)
settings.destination.index = Cluster.proto_name(settings.destination.alias)
schema = convert.json2value(File(settings.destination.schema_file).read(), paths=True, flexible=True)
if transform_bugzilla.USE_ATTACHMENTS_DOT:
schema = CNV.JSON2object(CNV.object2JSON(schema).replace("attachments_", "attachments."))
schema = convert.json2value(convert.value2json(schema).replace("attachments_", "attachments."))
dest = ElasticSearch.create_index(settings.destination, schema, limit_replicas=True)
dest = Cluster(settings.destination).create_index(settings.destination, schema, limit_replicas=True)
dest.set_refresh_interval(-1)
extract_from_file(settings.source, dest)
dest.set_refresh_interval(1)
@ -212,15 +214,15 @@ def main(settings):
else:
# SYNCH WITH source ES INDEX
source=ElasticSearch(settings.source)
source=elasticsearch.Index(settings.source)
# USE A DESTINATION FILE
if settings.destination.filename:
Log.note("Sending records to file: {{filename}}", {"filename":settings.destination.filename})
file = File(settings.destination.filename)
destination = Struct(
extend=lambda x: file.extend([CNV.object2JSON(v["value"]) for v in x]),
destination = Dict(
extend=lambda x: file.extend([convert.value2json(v["value"]) for v in x]),
file=file
)
else:
@ -229,17 +231,17 @@ def main(settings):
# GET LAST UPDATED
from_file = None
if time_file.exists:
from_file = CNV.milli2datetime(CNV.value2int(time_file.read()))
from_file = convert.milli2datetime(convert.value2int(time_file.read()))
from_es = get_last_updated(destination) - timedelta(hours=1)
last_updated = MIN(nvl(from_file, CNV.milli2datetime(0)), from_es)
last_updated = MIN(coalesce(from_file, convert.milli2datetime(0)), from_es)
Log.note("updating records with modified_ts>={{last_updated}}", {"last_updated":last_updated})
pending = get_pending(source, last_updated)
with ThreadedQueue(destination, size=1000) as data_sink:
with ThreadedQueue(destination, max_size=1000) as data_sink:
replicate(source, data_sink, pending, last_updated)
# RECORD LAST UPDATED
time_file.write(unicode(CNV.datetime2milli(current_time)))
time_file.write(unicode(convert.datetime2milli(current_time)))
def start():

View File

@ -1,12 +1,23 @@
# encoding: utf-8
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from datetime import date
import re
from pyLibrary.cnv import CNV
from pyLibrary.env import elasticsearch
from pyLibrary.env.logs import Log
from pyLibrary.queries import Q
from pyLibrary import convert
from pyLibrary.debugs.logs import Log
from pyLibrary.env import elasticsearch
from pyLibrary.queries import qb
USE_ATTACHMENTS_DOT = True
@ -34,7 +45,7 @@ DATE_PATTERN_RELAXED = re.compile("^[0-9]{4}[\\/-][0-9]{2}[\\/-][0-9]{2}")
def rename_attachments(bug_version):
if bug_version.attachments == None: return bug_version
if not USE_ATTACHMENTS_DOT:
bug_version.attachments=CNV.JSON2object(CNV.object2JSON(bug_version.attachments).replace("attachments.", "attachments_"))
bug_version.attachments=convert.json2value(convert.value2json(bug_version.attachments).replace("attachments.", "attachments_"))
return bug_version
@ -47,31 +58,31 @@ def normalize(bug, old_school=False):
#ENSURE STRUCTURES ARE SORTED
# Do some processing to make sure that diffing between runs stays as similar as possible.
bug.flags=Q.sort(bug.flags, "value")
bug.flags=qb.sort(bug.flags, "value")
if bug.attachments:
if USE_ATTACHMENTS_DOT:
bug.attachments=CNV.JSON2object(CNV.object2JSON(bug.attachments).replace("attachments_", "attachments."))
bug.attachments = Q.sort(bug.attachments, "attach_id")
bug.attachments=convert.json2value(convert.value2json(bug.attachments).replace("attachments_", "attachments."))
bug.attachments = qb.sort(bug.attachments, "attach_id")
for a in bug.attachments:
for k,v in list(a.items()):
if k.startswith("attachments") and (k.endswith("isobsolete") or k.endswith("ispatch") or k.endswith("isprivate")):
new_v=CNV.value2int(v)
new_v=convert.value2int(v)
new_k=k[12:]
a[k.replace(".", "\.")]=new_v
if not old_school:
a[new_k]=new_v
a.flags = Q.sort(a.flags, ["modified_ts", "value"])
a.flags = qb.sort(a.flags, ["modified_ts", "value"])
if bug.changes != None:
if USE_ATTACHMENTS_DOT:
json = CNV.object2JSON(bug.changes).replace("attachments_", "attachments.")
bug.changes=CNV.JSON2object(json)
bug.changes = Q.sort(bug.changes, ["attach_id", "field_name"])
json = convert.value2json(bug.changes).replace("attachments_", "attachments.")
bug.changes=convert.json2value(json)
bug.changes = qb.sort(bug.changes, ["attach_id", "field_name"])
#bug IS CONVERTED TO A 'CLEAN' COPY
bug = elasticsearch.scrub(bug)
# bug.attachments = nvl(bug.attachments, []) # ATTACHMENTS MUST EXIST
# bug.attachments = coalesce(bug.attachments, []) # ATTACHMENTS MUST EXIST
for f in NUMERIC_FIELDS:
@ -79,11 +90,11 @@ def normalize(bug, old_school=False):
if v == None:
continue
elif f in MULTI_FIELDS:
bug[f] = CNV.value2intlist(v)
elif CNV.value2number(v) == 0:
bug[f] = convert.value2intlist(v)
elif convert.value2number(v) == 0:
del bug[f]
else:
bug[f]=CNV.value2number(v)
bug[f]=convert.value2number(v)
# Also reformat some date fields
for dateField in ["deadline", "cf_due_date", "cf_last_resolved"]:
@ -91,7 +102,7 @@ def normalize(bug, old_school=False):
if v == None: continue
try:
if isinstance(v, date):
bug[dateField] = CNV.datetime2milli(v)
bug[dateField] = convert.datetime2milli(v)
elif isinstance(v, (long, int, float)) and len(unicode(v)) in [12, 13]:
bug[dateField] = v
elif not isinstance(v, basestring):
@ -100,17 +111,17 @@ def normalize(bug, old_school=False):
# Convert to "2012/01/01 00:00:00.000"
# Example: bug 856732 (cf_last_resolved)
# dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z"
bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v+"000", "%Y/%m/%d %H:%M%:S%f"))
bug[dateField] = convert.datetime2milli(convert.string2datetime(v+"000", "%Y/%m/%d %H:%M:%S.%f"))
elif DATE_PATTERN_STRICT_SHORT.match(v):
# Convert "2012/01/01 00:00:00" to "2012-01-01T00:00:00.000Z", then to a timestamp.
# Example: bug 856732 (cf_last_resolved)
# dateString = v.substring(0, 10).replace("/", '-') + "T" + v.substring(11) + "Z"
bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v.replace("-", "/"), "%Y/%m/%d %H:%M:%S"))
bug[dateField] = convert.datetime2milli(convert.string2datetime(v.replace("-", "/"), "%Y/%m/%d %H:%M:%S"))
elif DATE_PATTERN_RELAXED.match(v):
# Convert "2012/01/01 00:00:00.000" to "2012-01-01"
# Example: bug 643420 (deadline)
# bug 726635 (cf_due_date)
bug[dateField] = CNV.datetime2milli(CNV.string2datetime(v[0:10], "%Y-%m-%d"))
bug[dateField] = convert.datetime2milli(convert.string2datetime(v[0:10], "%Y-%m-%d"))
except Exception, e:
Log.error("problem with converting date to milli (type={{type}}, value={{value}})", {"value":bug[dateField], "type":type(bug[dateField]).name}, e)

View File

@ -18,7 +18,7 @@ Module `meta`
**Description**
`@use_settings` will decorate a function to accept a `settings` parameter which is just like `**kwargs`, but the named parameters can override the properties in `settings`, rather than raise duplicate keyname exceptions.
`@use_settings` will decorate a function to accept a `settings` parameter which is just like `**kwargs`, but the other parameters can override the properties in `settings`, rather than raise duplicate keyname exceptions.
**Example**
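A minimal sketch of the behaviour described above (the file's own example is not shown in this hunk; the function name and parameters below are illustrative, and a plain `dict` is assumed to be accepted for `settings`):

from pyLibrary.meta import use_settings

@use_settings
def connect(host, port=9200, username=None, settings=None):
    # named parameters win; everything else arrives bundled in `settings`
    return host, port, username

creds = {"host": "example.com", "port": 9200, "username": "kyle"}
connect(settings=creds, port=9300)   # port=9300 overrides creds["port"] instead of raising a duplicate-keyname error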

View File

@ -10,7 +10,6 @@
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from __future__ import absolute_import
import HTMLParser
import StringIO
@ -23,11 +22,12 @@ import gzip
import hashlib
from io import BytesIO
import json
from numbers import Number
import re
from tempfile import TemporaryFile
from pyLibrary import strings, meta
from pyLibrary.dot import wrap, wrap_dot, unwrap
from pyLibrary.dot import wrap, wrap_dot, unwrap, unwraplist
from pyLibrary.collections.multiset import Multiset
from pyLibrary.debugs.logs import Log, Except
from pyLibrary.env.big_data import FileString, safe_size
@ -103,6 +103,7 @@ def json2value(json_string, params={}, flexible=False, paths=False):
if params:
json_string = expand_template(json_string, params)
# LOOKUP REFERENCES
value = wrap(json_decoder(json_string))
@ -113,10 +114,10 @@ def json2value(json_string, params={}, flexible=False, paths=False):
except Exception, e:
e = Except.wrap(e)
if ("Expecting '" in e and "' delimiter: line" in e) or "Expecting property name enclosed in double quotes: " in e:
if "Expecting '" in e and "' delimiter: line" in e:
line_index = int(strings.between(e.message, " line ", " column ")) - 1
column = int(strings.between(e.message, " column ", " ")) - 1
line = json_string.split("\n")[line_index].replace("\t", " ")
line = json_string.split("\n")[line_index]
if column > 20:
sample = "..." + line[column - 20:]
pointer = " " + (" " * 20) + "^"
@ -243,22 +244,52 @@ def list2tab(rows):
return "\t".join(keys) + "\n" + "\n".join(output)
def list2table(rows):
columns = set()
for r in rows:
columns |= set(r.keys())
keys = list(columns)
def list2table(rows, column_names=None):
if column_names:
keys = list(set(column_names))
else:
columns = set()
for r in rows:
columns |= set(r.keys())
keys = list(columns)
output = []
for r in rows:
output.append([r[k] for k in keys])
output = [[unwraplist(r[k]) for k in keys] for r in rows]
return wrap({
"meta": {"format": "table"},
"header": keys,
"data": output
})
def list2cube(rows, column_names=None):
if column_names:
keys = column_names
else:
columns = set()
for r in rows:
columns |= set(r.keys())
keys = list(columns)
data = {k: [] for k in keys}
output = wrap({
"meta": {"format": "cube"},
"edges": [
{
"name": "rownum",
"domain": {"type": "rownum", "min": 0, "max": len(rows), "interval": 1}
}
],
"data": data
})
for r in rows:
for k in keys:
data[k].append(r[k])
return output
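# ILLUSTRATION: GIVEN rows = [{"a": 1, "b": 2}, {"a": 3, "b": 4}] (COLUMN ORDER FOLLOWS SET ITERATION UNLESS column_names IS GIVEN)
#   list2table(rows) -> {"meta": {"format": "table"}, "header": ["a", "b"], "data": [[1, 2], [3, 4]]}
#   list2cube(rows)  -> {"meta": {"format": "cube"}, "edges": [{"name": "rownum", ...}], "data": {"a": [1, 3], "b": [2, 4]}}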
def value2string(value):
# PROPER NULL HANDLING
if value == None:
@ -443,11 +474,15 @@ def bytes2sha1(value):
def value2intlist(value):
if value == None:
return None
elif isinstance(value, Number):
return [int(value)]
elif isinstance(value, basestring):
if value.strip() == "":
return None
return [int(value)]
elif hasattr(value, '__iter__'):
output = [int(d) for d in value if d != "" and d != None]
return output
elif value.strip() == "":
return None
else:
return [int(value)]
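# ILLUSTRATION OF THE CASES ABOVE:
#   value2intlist(3) -> [3]      value2intlist("42") -> [42]      value2intlist("  ") -> None
#   value2intlist(["1", "", None, 2]) -> [1, 2]                   value2intlist(None) -> None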
@ -547,7 +582,7 @@ def ini2value(ini_content):
buff = StringIO.StringIO(ini_content)
config = ConfigParser()
config.read(buff, "dummy")
config._read(buff, "dummy")
output = {}
for section in config.sections():

View File

@ -28,6 +28,7 @@ class Log_usingElasticSearch(BaseLog):
self.es = Cluster(settings).get_or_create_index(
schema=convert.json2value(convert.value2json(SCHEMA), paths=True),
limit_replicas=True,
tjson=True,
settings=settings
)
self.queue = self.es.threaded_queue(max_size=max_size, batch_size=batch_size)
@ -60,7 +61,7 @@ class Log_usingElasticSearch(BaseLog):
SCHEMA = {
"settings": {
"index.number_of_shards": 3,
"index.number_of_shards": 1,
"index.number_of_replicas": 2,
"index.store.throttle.type": "merge",
"index.store.throttle.max_bytes_per_sec": "2mb",
@ -73,13 +74,38 @@ SCHEMA = {
{
"values_strings": {
"match": "*",
"match_mapping_type" : "string",
"match_mapping_type": "string",
"mapping": {
"type": "string",
"index": "not_analyzed"
"index": "not_analyzed",
"doc_values": True
}
}
},
{
"default_doubles": {
"mapping": {
"index": "not_analyzed",
"type": "double",
"doc_values": True
},
"match_mapping_type": "double",
"match": "*"
}
},
{
"default_longs": {
"mapping": {
"index": "not_analyzed",
"type": "long",
"doc_values": True
},
"match_mapping_type": "long|integer",
"match_pattern": "regex",
"path_match": ".*"
}
}
],
"_all": {
"enabled": False
@ -90,11 +116,17 @@ SCHEMA = {
},
"properties": {
"timestamp": {
"type": "double",
"index": "not_analyzed",
"store": "yes"
"type": "object",
"properties": {
"$value": {
"type": "double",
"index": "not_analyzed",
"store": "yes",
"doc_values": True
}
}
},
"params": {
"params": { # JUST IN CASE WE ARE NOT USING TYPED JSON
"type": "object",
"enabled": False,
"index": "no",

Просмотреть файл

@ -256,9 +256,9 @@ class Log(object):
@classmethod
def error(
cls,
template, # human readable template
default_params={}, # parameters for template
cause=None, # pausible cause
template, # human readable template
default_params={}, # parameters for template
cause=None, # plausible cause
stack_depth=0, # stack trace offset (==1 if you do not want to report self)
**more_params
):
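# CALLING STYLES AS USED ELSEWHERE IN THIS CODEBASE:
#   Log.error("issues", email)                                      # an exception in the second position becomes the cause
#   Log.error("Can not set attribute {{name}}", name=k, cause=e)    # template params as keywords, cause passed explicitly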
@ -270,6 +270,7 @@ class Log(object):
default_params = {}
params = dict(unwrap(default_params), **more_params)
add_to_trace = False
if cause == None:
cause = []

Просмотреть файл

@ -60,7 +60,7 @@ def read_settings(filename=None, defs=None):
Log.error("Can not find settings file {{filename}}", {
"filename": settings_file.abspath
})
settings = ref.get("file://" + settings_file.abspath)
settings = ref.get("file:///" + settings_file.abspath.replace(os.sep, "/"))
if defs:
settings.args = _argparse(defs)
return settings

Просмотреть файл

@ -68,7 +68,7 @@ different names and slightly different variations, some examples are:
* `jinja2.environment.Environment.getattr()` to allow convenient dot notation
* `argparse.Environment()` - code performs `setattr(e, name, value)` on
instances of Environment to provide dot(`.`) accessors
* `collections.namedtuple()` - gives attribute names to tuple indicies
* `collections.namedtuple()` - gives attribute names to tuple indices
effectively providing <code>a.b</code> rather than <code>a["b"]</code>
offered by dicts
* [configman's DotDict](https://github.com/mozilla/configman/blob/master/configman/dotdict.py)
@ -131,7 +131,9 @@ replaced with `None` in all cases.
###Identity and Absorbing (Zero) Elements###
With closure we can realize we have defined an algebraic semigroup: The identity element is the dot string (`"."`) and the zero element is `Null` (or `None`).
With closure we can realize we have defined an algebraic semigroup: The
identity element is the dot string (`"."`) and the zero element is `Null`
(or `None`).
1. `a[Null] == Null`
2. `a["."] == a`
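A quick sketch of these two laws using `Dict` (assuming, as elsewhere in this commit, that `Dict` and `Null` come from `pyLibrary.dot`):

from pyLibrary.dot import Dict, Null

a = Dict(b=42)
assert a[Null] == Null   # absorbing element: dereferencing through Null stays Null
assert a["."] == a       # identity element: the dot string returns the value itself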
@ -208,7 +210,7 @@ all `a<=b`
* Trinary slicing `[::]` uses the flat list definition
When assuming a *flat-list*, we lose the *take-from-the-right* tricks gained
from modulo arithmetic on the indicies. Therefore, we require extra methods
from modulo arithmetic on the indices. Therefore, we require extra methods
to perform right-based slicing:
* **right()** - `flat_list.right(b)` same as `loop_list[-b:]` except when `b<=0`
@ -231,9 +233,17 @@ The dot operator on a `DictList` performs a simple projection; it will return a
DictObject for data
-------------------
There are two major families of objects in Object Oriented programming. The first, are ***Actors***: characterized by a number of useful instance methods and some state bundled into a package. The second are ***Data***: Primarily a set of properties, with only (de)serialization functions, or algebraic operators defined. Boto has many examples of these *Data* classes, [here is one](https://github.com/boto/boto/blob/4b8269562e663f090403e57ba1a3a471b6e0aa0e/boto/ec2/networkinterface.py).
There are two major families of objects in Object Oriented programming. The
first, are ***Actors***: characterized by a number of useful instance methods
and some state bundled into a package. The second are ***Data***: Primarily
a set of properties, with only (de)serialization functions, or algebraic
operators defined. Boto has many examples of these *Data* classes,
[here is one](https://github.com/boto/boto/blob/4b8269562e663f090403e57ba1a3a471b6e0aa0e/boto/ec2/networkinterface.py).
The problem with *Data* objects is they have an useless distinction between attributes and properties. This prevents us from using the `[]` operator for dereferencing, forcing use to use the verbose `__getattr__()` instead. It also prevents the use of query operators over these objects.
The problem with *Data* objects is they have a useless distinction between
attributes and properties. This prevents us from using the `[]` operator for
dereferencing, forcing us to use the verbose `getattr()` instead. It
also prevents the use of query operators over these objects.
You can register a class as a *data* class, by wrapping it with `DictClass`.
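A sketch of how that registration might look; the exact `DictClass` call signature and import location are assumptions, only the intent comes from the paragraph above.

from pyLibrary.dot.objects import DictClass   # assumed location, alongside dictwrap

class NetworkInterface(object):
    def __init__(self, subnet_id, private_ip_address):
        self.subnet_id = subnet_id
        self.private_ip_address = private_ip_address

NetworkInterface = DictClass(NetworkInterface)   # register as a *data* class

nic = NetworkInterface(subnet_id="subnet-123", private_ip_address="10.0.0.5")
nic["subnet_id"]   # `[]` dereferencing now works, so query operators can treat it as data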

View File

@ -64,7 +64,9 @@ def split_field(field):
"""
RETURN field AS ARRAY OF DOT-SEPARATED FIELDS
"""
if field.find(".") >= 0:
if field == "." or field==None:
return []
elif field.find(".") >= 0:
field = field.replace("\.", "\a")
return [k.replace("\a", ".") for k in field.split(".")]
else:
@ -75,7 +77,10 @@ def join_field(field):
"""
RETURN field SEQUENCE AS STRING
"""
return ".".join([f.replace(".", "\.") for f in field])
potent = [f for f in field if f != "."]
if not potent:
return "."
return ".".join([f.replace(".", "\.") for f in potent])
def hash_value(v):
@ -128,7 +133,13 @@ def _all_default(d, default, seen=None):
if existing_value == None:
if default_value != None:
_set_attr(d, [k], default_value)
try:
_set_attr(d, [k], default_value)
except Exception, e:
if PATH_NOT_FOUND not in e:
from pyLibrary.debugs.logs import Log
Log.error("Can not set attribute {{name}}", name=k, cause=e)
elif (hasattr(existing_value, "__setattr__") or isinstance(existing_value, Mapping)) and isinstance(default_value, Mapping):
df = seen.get(id(existing_value))
if df:
@ -143,13 +154,13 @@ def _getdefault(obj, key):
TRY BOTH ATTRIBUTE AND ITEM ACCESS, OR RETURN Null
"""
try:
return getattr(obj, key)
except Exception, e:
return obj[key]
except Exception, f:
pass
try:
return obj[key]
except Exception, f:
return getattr(obj, key)
except Exception, e:
pass
try:
@ -242,11 +253,13 @@ def _get_attr(obj, path):
obj = getattr(obj, attr_name)
return _get_attr(obj, path[1:])
except Exception, e:
try:
obj = obj[attr_name]
return _get_attr(obj, path[1:])
except Exception, f:
return None
pass
try:
obj = obj[attr_name]
return _get_attr(obj, path[1:])
except Exception, f:
return None
def _set_attr(obj, path, value):
@ -270,7 +283,7 @@ def _set_attr(obj, path, value):
new_value = value
try:
_get(obj, "__setattr__")(attr_name, new_value)
setattr(obj, attr_name, new_value)
return old_value
except Exception, e:
try:


@ -70,13 +70,19 @@ class Dict(MutableMapping):
def __getitem__(self, key):
if key == None:
return Null
if key == ".":
output = _get(self, "_dict")
if isinstance(output, Mapping):
return self
else:
return output
if isinstance(key, str):
key = key.decode("utf8")
elif not isinstance(key, unicode):
from pyLibrary.debugs.logs import Log
Log.error("only string keys are supported")
d = _get(self, "_dict")
if key.find(".") >= 0:
@ -96,6 +102,13 @@ class Dict(MutableMapping):
from pyLibrary.debugs.logs import Log
Log.error("key is empty string. Probably a bad idea")
if key == ".":
# SOMETHING TERRIBLE HAPPENS WHEN value IS NOT A Mapping;
# HOPEFULLY THE ONLY OTHER METHOD RUN ON self IS unwrap()
v = unwrap(value)
_set(self, "_dict", v)
return v
if isinstance(key, str):
key = key.decode("utf8")
@ -257,13 +270,13 @@ class Dict(MutableMapping):
try:
return "Dict("+dict.__str__(_get(self, "_dict"))+")"
except Exception, e:
return "{}"
return "Dict{}"
def __repr__(self):
try:
return "Dict("+dict.__repr__(_get(self, "_dict"))+")"
except Exception, e:
return "Dict{}"
return "Dict()"
class _DictUsingSelf(dict):
@ -460,7 +473,6 @@ class _DictUsingSelf(dict):
return "Dict()"
# KEEP TRACK OF WHAT ATTRIBUTES ARE REQUESTED, MAYBE SOME (BUILTIN) ARE STILL USEFUL
requested = set()


@ -18,14 +18,14 @@ from pyLibrary.dot import wrap, unwrap
_get = object.__getattribute__
_set = object.__setattr__
dictwrap = None
_dictwrap = None
def _late_import():
global dictwrap
from pyLibrary.dot.objects import dictwrap
global _dictwrap
from pyLibrary.dot.objects import dictwrap as _dictwrap
_ = dictwrap
_ = _dictwrap
class DictList(list):
"""
@ -82,10 +82,10 @@ class DictList(list):
"""
simple `select`
"""
if not dictwrap:
if not _dictwrap:
_late_import()
return DictList(vals=[unwrap(dictwrap(v)[key]) for v in _get(self, "list")])
return DictList(vals=[unwrap(_dictwrap(v)[key]) for v in _get(self, "list")])
def filter(self, _filter):
return DictList(vals=[unwrap(u) for u in (wrap(v) for v in _get(self, "list")) if _filter(u)])
@ -112,6 +112,9 @@ class DictList(list):
Log.warning("slicing is broken in Python 2.7: a[i:j] == a[i+len(a), j] sometimes. Use [start:stop:step] (see https://github.com/klahnakoski/pyLibrary/blob/master/pyLibrary/dot/README.md#the-slice-operator-in-python27-is-inconsistent)")
return self[i:j:]
def __list__(self):
return self.list
def copy(self):
return DictList(list(_get(self, "list")))


@ -22,6 +22,9 @@ WRAPPED_CLASSES = set()
class DictObject(Mapping):
"""
TREAT AN OBJECT LIKE DATA
"""
def __init__(self, obj):
_set(self, "_obj", obj)
@ -90,12 +93,16 @@ def dictwrap(v):
m = Dict()
_set(m, "_dict", v) # INJECT m.__dict__=v SO THERE IS NO COPY
return m
elif type_ is Dict:
return v
elif type_ is NoneType:
return None # So we allow `is None`
return None # So we allow `is None` (OFTEN USED IN PYTHON LIBRARIES)
elif type_ is list:
return DictList(v)
elif type_ is GeneratorType:
return (wrap(vv) for vv in v)
elif hasattr(v, "as_dict"):
return v.as_dict()
elif isinstance(v, (basestring, int, float, Decimal, datetime, date, Dict, DictList, NullType, NoneType)):
return v
else:

pyLibrary/env/elasticsearch.py

@ -18,19 +18,28 @@ import time
from pyLibrary import convert
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import coalesce, Null, Dict, set_default, join_field, split_field
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import wrap
from pyLibrary.env import http
from pyLibrary.jsons.typed_encoder import json2typed
from pyLibrary.maths.randoms import Random
from pyLibrary.maths import Math
from pyLibrary.meta import use_settings
from pyLibrary.queries import qb
from pyLibrary.strings import utf82unicode
from pyLibrary.dot import coalesce, Null, Dict, set_default
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import wrap, unwrap
from pyLibrary.thread.threads import ThreadedQueue, Thread
from pyLibrary.thread.threads import ThreadedQueue, Thread, Lock
class Index(object):
ES_NUMERIC_TYPES = ["long", "integer", "double", "float"]
ES_PRIMITIVE_TYPES = ["string", "boolean", "integer", "date", "long", "double"]
class Features(object):
pass
class Index(Features):
"""
AN ElasticSearch INDEX LIFETIME MANAGEMENT TOOL
@ -53,18 +62,17 @@ class Index(object):
alias=None,
explore_metadata=True, # PROBING THE CLUSTER FOR METADATA IS ALLOWED
read_only=True,
tjson=False, # STORED AS TYPED JSON
timeout=None, # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
debug=False, # DO NOT SHOW THE DEBUG STATEMENTS
settings=None
):
if index==None:
Log.error("not allowed")
if index == alias:
Log.error("must have a unique index name")
self.cluster_state = None
self.cluster_metadata = None
self.debug = debug
if self.debug:
Log.alert("elasticsearch debugging for index {{index}} is on", index=settings.index)
@ -73,16 +81,19 @@ class Index(object):
self.cluster = Cluster(settings)
try:
index = self.get_index(index)
if index and alias==None:
full_index = self.get_index(index)
if full_index and alias==None:
settings.alias = settings.index
settings.index = index
if index == None:
settings.index = full_index
if full_index==None:
Log.error("not allowed")
if type == None:
# NO type PROVIDED, MAYBE THERE IS A SUITABLE DEFAULT?
indices = self.cluster.get_metadata().indices
index_ = indices[self.settings.index]
with self.cluster.metadata_locker:
index_ = self.cluster._metadata.indices[self.settings.index]
if not index_:
indices = self.cluster.get_metadata(index=self.settings.index).indices
index_ = indices[self.settings.index]
candidate_types = list(index_.mappings.keys())
if len(candidate_types) != 1:
@ -90,13 +101,16 @@ class Index(object):
self.settings.type = type = candidate_types[0]
except Exception, e:
# EXPLORING (get_metadata()) IS NOT ALLOWED ON THE PUBLIC CLUSTER
pass
Log.error("not expected", cause=e)
self.path = "/" + index + "/" + type
if not type:
Log.error("not allowed")
self.path = "/" + full_index + "/" + type
@property
def url(self):
return self.cluster.path + "/" + self.path
return self.cluster.path.rstrip("/") + "/" + self.path.lstrip("/")
def get_schema(self, retry=True):
if self.settings.explore_metadata:
@ -134,25 +148,25 @@ class Index(object):
def add_alias(self, alias=None):
if alias:
self.cluster_state = None
self.cluster._post(
self.cluster.post(
"/_aliases",
data=convert.unicode2utf8(convert.value2json({
data={
"actions": [
{"add": {"index": self.settings.index, "alias": alias}}
]
})),
},
timeout=coalesce(self.settings.timeout, 30)
)
else:
# SET ALIAS ACCORDING TO LIFECYCLE RULES
self.cluster_state = None
self.cluster._post(
self.cluster.post(
"/_aliases",
data=convert.unicode2utf8(convert.value2json({
data={
"actions": [
{"add": {"index": self.settings.index, "alias": self.settings.alias}}
]
})),
},
timeout=coalesce(self.settings.timeout, 30)
)
@ -160,9 +174,10 @@ class Index(object):
"""
RETURN THE INDEX USED BY THIS alias
"""
alias_list = self.cluster.get_aliases()
output = sort([
a.index
for a in self.cluster.get_aliases()
for a in alias_list
if a.alias == alias or
a.index == alias or
(re.match(re.escape(alias) + "\\d{8}_\\d{6}", a.index) and a.index != alias)
@ -186,7 +201,7 @@ class Index(object):
return True
def flush(self):
self.cluster._post("/" + self.settings.index + "/_refresh")
self.cluster.post("/" + self.settings.index + "/_refresh")
def delete_record(self, filter):
if self.settings.read_only:
@ -250,7 +265,10 @@ class Index(object):
Log.error("Expecting every record given to have \"value\" or \"json\" property")
lines.append('{"index":{"_id": ' + convert.value2json(id) + '}}')
lines.append(json)
if self.settings.tjson:
lines.append(json2typed(json))
else:
lines.append(json)
del records
if not lines:
@ -263,7 +281,7 @@ class Index(object):
Log.error("can not make request body from\n{{lines|indent}}", lines=lines, cause=e)
response = self.cluster._post(
response = self.cluster.post(
self.path + "/_bulk",
data=data_bytes,
headers={"Content-Type": "text"},
@ -279,7 +297,7 @@ class Index(object):
error=item.index.error,
line=lines[i * 2 + 1]
)
elif any(map(self.cluster.version.startswith, ["1.4.", "1.5.", "1.6."])):
elif any(map(self.cluster.version.startswith, ["1.4.", "1.5.", "1.6.", "1.7."])):
if item.index.status not in [200, 201]:
Log.error(
"{{num}} {{error}} while loading line into {{index}}:\n{{line}}",
@ -323,7 +341,7 @@ class Index(object):
Log.error("Can not set refresh interval ({{error}})", {
"error": utf82unicode(response.all_content)
})
elif any(map(self.cluster.version.startswith, ["1.4.", "1.5.", "1.6."])):
elif any(map(self.cluster.version.startswith, ["1.4.", "1.5.", "1.6.", "1.7."])):
response = self.cluster.put(
"/" + self.settings.index + "/_settings",
data=convert.unicode2utf8('{"index":{"refresh_interval":' + convert.value2json(interval) + '}}')
@ -347,9 +365,9 @@ class Index(object):
else:
show_query = query
Log.note("Query:\n{{query|indent}}", query=show_query)
return self.cluster._post(
return self.cluster.post(
self.path + "/_search",
data=convert.value2json(query).encode("utf8"),
data=query,
timeout=coalesce(timeout, self.settings.timeout)
)
except Exception, e:
@ -374,24 +392,41 @@ class Index(object):
self.cluster.delete_index(index=self.settings.index)
known_clusters = {}
class Cluster(object):
@use_settings
def __init__(self, host, port=9200, settings=None):
def __new__(cls, host, port=9200, settings=None):
if not isinstance(port, int):
Log.error("port must be integer")
cluster = known_clusters.get((host, port))
if cluster:
return cluster
cluster = object.__new__(cls)
known_clusters[(host, port)] = cluster
return cluster
@use_settings
def __init__(self, host, port=9200, explore_metadata=True, settings=None):
"""
settings.explore_metadata == True - IF PROBING THE CLUSTER FOR METADATA IS ALLOWED
settings.timeout == NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
"""
if hasattr(self, "settings"):
return
settings.setdefault("explore_metadata", True)
self.cluster_state = None
self.cluster_metadata = None
self.debug = settings.debug
self.settings = settings
self.cluster_state = None
self._metadata = None
self.metadata_locker = Lock()
self.debug = settings.debug
self.version = None
self.path = settings.host + ":" + unicode(settings.port)
self.get_metadata()
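# NOTE: __new__ above caches Cluster instances by (host, port), and __init__ returns
# early once "settings" is set, so repeated construction with the same coordinates is
# expected to reuse one object. Sketch (host value is hypothetical):
#
#     a = Cluster(host="http://localhost", port=9200)
#     b = Cluster(host="http://localhost", port=9200)
#     assert a is b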
@use_settings
def get_or_create_index(
self,
@ -400,6 +435,7 @@ class Cluster(object):
schema=None,
limit_replicas=None,
read_only=False,
tjson=False,
settings=None
):
best = self._get_best(settings)
@ -489,6 +525,7 @@ class Cluster(object):
schema=None,
limit_replicas=None,
read_only=False,
tjson=False,
settings=None
):
if not settings.alias:
@ -518,7 +555,7 @@ class Cluster(object):
)
schema.settings.index.number_of_replicas = health.number_of_nodes - 1
self._post(
self.post(
"/" + settings.index,
data=convert.value2json(schema).encode("utf8"),
headers={"Content-Type": "application/json"}
@ -542,9 +579,9 @@ class Cluster(object):
RETURN LIST OF {"alias":a, "index":i} PAIRS
ALL INDEXES INCLUDED, EVEN IF NO ALIAS {"alias":Null}
"""
data = self.get_metadata().indices
data = self.get("/_cluster/state")
output = []
for index, desc in data.items():
for index, desc in data.metadata.indices.items():
if not desc["aliases"]:
output.append({"index": index, "alias": None})
else:
@ -552,29 +589,38 @@ class Cluster(object):
output.append({"index": index, "alias": a})
return wrap(output)
def get_metadata(self):
if self.settings.explore_metadata:
if not self.cluster_metadata:
response = self.get("/_cluster/state")
self.cluster_metadata = wrap(response.metadata)
self.cluster_state = wrap(self.get("/"))
self.version = self.cluster_state.version.number
else:
Log.error("Metadata exploration has been disabled")
return self.cluster_metadata
def get_metadata(self, index=None, force=False):
with self.metadata_locker:
if self.settings.explore_metadata:
if not self._metadata or (force and index is None):
response = self.get("/_cluster/state")
self._metadata = wrap(response.metadata)
self.cluster_state = wrap(self.get("/"))
self.version = self.cluster_state.version.number
elif index: # UPDATE THE MAPPING FOR ONE INDEX ONLY
response = self.get("/"+index+"/_mapping")
self._metadata.indices[index].mappings = qb.sort(response.items(), 0).last()[1].mappings
return Dict(indices={index: self._metadata.indices[index]})
else:
Log.error("Metadata exploration has been disabled")
return self._metadata
def _post(self, path, **kwargs):
def post(self, path, **kwargs):
url = self.settings.host + ":" + unicode(self.settings.port) + path
try:
wrap(kwargs).headers["Accept-Encoding"] = "gzip,deflate"
if "data" in kwargs and not isinstance(kwargs["data"], str):
data = kwargs.get(b'data')
if data == None:
pass
elif isinstance(data, Mapping):
kwargs[b'data'] = data = convert.unicode2utf8(convert.value2json(data))
elif not isinstance(kwargs["data"], str):
Log.error("data must be utf8 encoded string")
if self.debug:
sample = kwargs.get("data", "")[:300]
sample = kwargs.get(b'data', "")[:300]
Log.note("{{url}}:\n{{data|indent}}", url=url, data=sample)
response = http.post(url, **kwargs)
@ -597,9 +643,12 @@ class Cluster(object):
suggestion = ""
if kwargs.get("data"):
Log.error("Problem with call to {{url}}" + suggestion + "\n{{body|left(10000)}}",
Log.error(
"Problem with call to {{url}}" + suggestion + "\n{{body|left(10000)}}",
url=url,
body=kwargs["data"][0:10000] if self.debug else kwargs["data"][0:100], cause=e)
body=kwargs["data"][0:10000] if self.debug else kwargs["data"][0:100],
cause=e
)
else:
Log.error("Problem with call to {{url}}" + suggestion, url=url, cause=e)
@ -718,7 +767,7 @@ def _scrub(r):
if len(output) == 1:
return output[0]
try:
return sort(output) # SUCCESS ONLY ON STRINGS, OR NUMBERS
return sort(output)
except Exception:
return output
else:
@ -728,7 +777,7 @@ def _scrub(r):
class Alias(object):
class Alias(Features):
@use_settings
def __init__(
self,
@ -751,11 +800,13 @@ class Alias(object):
Log.error("Alias() was given no `type` (aka schema) and not allowed to explore metadata. Do not know what to do now.")
indices = self.cluster.get_metadata().indices
if not self.settings.alias or self.settings.alias == self.settings.index:
candidates = [(name, i) for name, i in indices.items() if self.settings.index in i.aliases]
index = qb.sort(candidates, 0).last()[1]
if not self.settings.alias or self.settings.alias==self.settings.index:
alias_list = self.cluster.get("/_alias/"+self.settings.index)
candidates = [(name, i) for name, i in alias_list.items() if self.settings.index in i.aliases.keys()]
full_name = qb.sort(candidates, 0).last()[0]
index = self.cluster.get("/" + full_name + "/_mapping")[full_name]
else:
index = indices[self.settings.index]
index = self.cluster.get("/"+self.settings.index+"/_mapping")[self.settings.index]
# FIND MAPPING WITH MOST PROPERTIES (AND ASSUME THAT IS THE CANONICAL TYPE)
max_prop = -1
@ -773,7 +824,7 @@ class Alias(object):
@property
def url(self):
return self.cluster.path + "/" + self.path
return self.cluster.path.rstrip("/") + "/" + self.path.lstrip("/")
def get_schema(self, retry=True):
if self.settings.explore_metadata:
@ -871,8 +922,8 @@ class Alias(object):
show_query.facets = {k: "..." for k in query.facets.keys()}
else:
show_query = query
Log.note("Query:\n{{query|indent}}", query= show_query)
return self.cluster._post(
Log.note("Query:\n{{query|indent}}", query=show_query)
return self.cluster.post(
self.path + "/_search",
data=convert.value2json(query).encode("utf8"),
timeout=coalesce(timeout, self.settings.timeout)
@ -886,6 +937,98 @@ class Alias(object):
)
def parse_properties(parent_index_name, parent_query_path, esProperties):
"""
RETURN THE COLUMN DEFINITIONS IN THE GIVEN esProperties OBJECT
"""
from pyLibrary.queries.meta import Column
columns = DictList()
for name, property in esProperties.items():
if parent_query_path:
index_name, query_path = parent_index_name, join_field(split_field(parent_query_path) + [name])
else:
index_name, query_path = parent_index_name, name
if property.type == "nested" and property.properties:
# NESTED TYPE IS A NEW TYPE DEFINITION
# MARKUP CHILD COLUMNS WITH THE EXTRA DEPTH
self_columns = parse_properties(index_name, query_path, property.properties)
for c in self_columns:
if not c.nested_path:
c.nested_path = [query_path]
else:
c.nested_path.insert(0, query_path)
columns.extend(self_columns)
columns.append(Column(
table=index_name,
name=query_path,
abs_name=query_path,
type="nested",
nested_path=[name]
))
continue
if property.properties:
child_columns = parse_properties(index_name, query_path, property.properties)
columns.extend(child_columns)
columns.append(Column(
table=index_name,
name=query_path,
abs_name=query_path,
type="object"
))
if property.dynamic:
continue
if not property.type:
continue
if property.type == "multi_field":
property.type = property.fields[name].type # PULL DEFAULT TYPE
for i, (n, p) in enumerate(property.fields.items()):
if n == name:
# DEFAULT
columns.append(Column(
table=index_name,
name=query_path,
type=p.type
))
else:
columns.append(Column(
table=index_name,
name=query_path + "." + n,
type=p.type
))
continue
if property.type in ["string", "boolean", "integer", "date", "long", "double"]:
columns.append(Column(
table=index_name,
name=query_path,
abs_name=query_path,
type=property.type
))
if property.index_name and name != property.index_name:
columns.append(Column(
table=index_name,
name=property.index_name,
type=property.type
))
elif property.enabled == None or property.enabled == False:
columns.append(Column(
table=index_name,
name=query_path,
abs_name=query_path,
type="object"
))
else:
Log.warning("unknown type {{type}} for property {{path}}", type=property.type, path=query_path)
return columns
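# Sketch: for an ES properties fragment such as
#
#     {"status": {"type": "string"},
#      "attachments": {"type": "nested", "properties": {"size": {"type": "long"}}}}
#
# parse_properties("bugs", None, wrap(properties)) is expected to yield roughly
# (the index name "bugs" is hypothetical):
#
#     Column(table="bugs", name="status", abs_name="status", type="string")
#     Column(table="bugs", name="attachments.size", abs_name="attachments.size", type="long", nested_path=["attachments"])
#     Column(table="bugs", name="attachments", abs_name="attachments", type="nested", nested_path=["attachments"])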
def _merge_mapping(a, b):
"""
MERGE TWO MAPPINGS, a TAKES PRECEDENCE
@ -990,5 +1133,3 @@ _merge_type = {
}
}

pyLibrary/env/files.py

@ -16,7 +16,8 @@ import shutil
from pyLibrary.strings import utf82unicode
from pyLibrary.maths import crypto
from pyLibrary.dot import coalesce
from pyLibrary.dot import coalesce, set_default, split_field, join_field
from pyLibrary.dot import listwrap, wrap
from pyLibrary import convert

pyLibrary/env/http.py

@ -99,7 +99,7 @@ def request(method, url, **kwargs):
if " Read timed out." in e:
Log.error("Timeout failure (timeout was {{timeout}}", timeout=timeout, cause=e)
else:
Log.error("Request failure", e)
Log.error("Request failure of {{url}}", url=url, cause=e)
def _to_ascii_dict(headers):


@ -5,7 +5,7 @@ import json
import re
from types import NoneType
from pyLibrary.dot import DictList, NullType
from pyLibrary.dot import DictList, NullType, Dict, unwrap
from pyLibrary.dot.objects import DictObject
from pyLibrary.times.dates import Date
@ -22,10 +22,10 @@ def _late_import():
global datetime2unix
global utf82unicode
from pyLibrary.debugs.logs import Log
from pyLibrary.debugs.logs import Log as _Log
from pyLibrary.convert import datetime2unix, utf82unicode
_ = Log
_ = _Log
_ = datetime2unix
_ = utf82unicode
@ -50,7 +50,6 @@ def replace(match):
def quote(value):
value
return "\"" + ESCAPE.sub(replace, value) + "\""
@ -82,6 +81,8 @@ def _scrub(value, is_done):
return utf82unicode(value)
elif type_ is Decimal:
return float(value)
elif type_ is Dict:
return _scrub(unwrap(value), is_done)
elif isinstance(value, Mapping):
_id = id(value)
if _id in is_done:


@ -36,7 +36,7 @@ json_decoder = json.JSONDecoder().decode
# THE DEFAULT JSON ENCODERS CAN NOT HANDLE A DIVERSITY OF TYPES *AND* BE FAST
#
# 1) WHEN USING cPython, WE HAVE NO COMPILER OPTIMIZATIONS: THE BEST STRATEGY IS TO
# CONVERT THE MEMORY STRUCTURE TO STANDARD TYPES AND SEND TO THE INSANELY FAST
# CONVERT THE MEMORY STRUCTURE TO STANDARD TYPES AND SEND TO THE INSANELY FAST
# DEFAULT JSON ENCODER
# 2) WHEN USING PYPY, WE USE CLEAR-AND-SIMPLE PROGRAMMING SO THE OPTIMIZER CAN DO
# ITS JOB. ALONG WITH THE UnicodeBuilder WE GET NEAR C SPEEDS
@ -67,11 +67,14 @@ except Exception, e:
append = UnicodeBuilder.append
_dealing_with_problem = False
def pypy_json_encode(value, pretty=False):
"""
pypy DOES NOT OPTIMIZE GENERATOR CODE WELL
"""
global _dealing_with_problem
if pretty:
return pretty_json(value)
@ -83,14 +86,23 @@ def pypy_json_encode(value, pretty=False):
except Exception, e:
# THE PRETTY JSON WILL PROVIDE MORE DETAIL ABOUT THE SERIALIZATION CONCERNS
from pyLibrary.debugs.logs import Log
Log.warning("Serialization of JSON problems", e)
if _dealing_with_problem:
Log.error("Serialization of JSON problems", e)
else:
Log.warning("Serialization of JSON problems", e)
_dealing_with_problem = True
try:
return pretty_json(value)
except Exception, f:
Log.error("problem serializing object", f)
finally:
_dealing_with_problem = False
almost_pattern = r"(?:\.(\d*)999)|(?:\.(\d*)000)"
def float_repr(value):
output = repr(value)
d = output.find(".")
@ -107,13 +119,14 @@ def float_repr(value):
else:
return output
json_encoder_module.FLOAT_REPR = float_repr
class cPythonJSONEncoder(object):
def __init__(self):
object.__init__(self)
self.encoder = json.JSONEncoder(
skipkeys=False,
ensure_ascii=False, # DIFF FROM DEFAULTS
@ -135,6 +148,7 @@ class cPythonJSONEncoder(object):
return unicode(self.encoder.encode(scrubbed))
except Exception, e:
from pyLibrary.debugs.logs import Log, Except
e = Except.wrap(e)
Log.warning("problem serializing {{type}}", type=_repr(value), cause=e)
raise e
@ -242,7 +256,6 @@ def _dict2json(value, _buffer):
append(_buffer, u"}")
ARRAY_ROW_LENGTH = 80
ARRAY_ITEM_MAX_LENGTH = 30
ARRAY_MAX_COLUMNS = 10
@ -262,7 +275,7 @@ def pretty_json(value):
from pyLibrary.debugs.logs import Log
try:
Log.note("try explicit convert of string with length {{length}}", length= len(value))
Log.note("try explicit convert of string with length {{length}}", length=len(value))
acc = [u"\""]
for c in value:
try:
@ -277,7 +290,7 @@ def pretty_json(value):
# Log.warning("odd character {{ord}} found in string. Ignored.", ord= ord(c)}, cause=g)
acc.append(u"\"")
output = u"".join(acc)
Log.note("return value of length {{length}}", length= len(output))
Log.note("return value of length {{length}}", length=len(output))
return output
except BaseException, f:
Log.warning("can not even explicit convert", f)
@ -291,8 +304,8 @@ def pretty_json(value):
return "{" + quote(unicode(items[0][0])) + ": " + pretty_json(items[0][1]).strip() + "}"
items = sorted(items, lambda a, b: value_compare(a[0], b[0]))
values = [quote(unicode(k))+": " + indent(pretty_json(v)).strip() for k, v in items if v != None]
return "{\n" + INDENT + (",\n"+INDENT).join(values) + "\n}"
values = [quote(unicode(k)) + ": " + indent(pretty_json(v)).strip() for k, v in items if v != None]
return "{\n" + INDENT + (",\n" + INDENT).join(values) + "\n}"
except Exception, e:
from pyLibrary.debugs.logs import Log
from pyLibrary.collections import OR
@ -309,7 +322,7 @@ def pretty_json(value):
if not value:
return "[]"
if ARRAY_MAX_COLUMNS==1:
if ARRAY_MAX_COLUMNS == 1:
return "[\n" + ",\n".join([indent(pretty_json(v)) for v in value]) + "\n]"
if len(value) == 1:
@ -323,14 +336,14 @@ def pretty_json(value):
max_len = max(*[len(j) for j in js])
if max_len <= ARRAY_ITEM_MAX_LENGTH and max(*[j.find("\n") for j in js]) == -1:
# ALL TINY VALUES
num_columns = max(1, min(ARRAY_MAX_COLUMNS, int(floor((ARRAY_ROW_LENGTH + 2.0)/float(max_len+2))))) # +2 TO COMPENSATE FOR COMMAS
if len(js)<=num_columns: # DO NOT ADD \n IF ONLY ONE ROW
num_columns = max(1, min(ARRAY_MAX_COLUMNS, int(floor((ARRAY_ROW_LENGTH + 2.0) / float(max_len + 2))))) # +2 TO COMPENSATE FOR COMMAS
if len(js) <= num_columns: # DO NOT ADD \n IF ONLY ONE ROW
return "[" + ", ".join(js) + "]"
if num_columns == 1: # DO NOT rjust IF THERE IS ONLY ONE COLUMN
return "[\n" + ",\n".join([indent(pretty_json(v)) for v in value]) + "\n]"
content = ",\n".join(
", ".join(j.rjust(max_len) for j in js[r:r+num_columns])
", ".join(j.rjust(max_len) for j in js[r:r + num_columns])
for r in xrange(0, len(js), num_columns)
)
return "[\n" + indent(content) + "\n]"
@ -363,13 +376,13 @@ def pretty_json(value):
return "null"
else:
try:
if int(value)==value:
if int(value) == value:
return str(int(value))
except Exception, e:
pass
try:
if float(value)==value:
if float(value) == value:
return str(float(value))
except Exception, e:
pass
@ -450,13 +463,11 @@ def datetime2milli(d, type):
_repr_ = Repr()
_repr_.maxlevel = 2
def _repr(obj):
return _repr_.repr(obj)
# OH HUM, cPython with uJSON, OR pypy WITH BUILTIN JSON?
# http://liangnuren.wordpress.com/2012/08/13/python-json-performance/
# http://morepypy.blogspot.ca/2011/10/speeding-up-json-encoding-in-pypy.html


@ -207,7 +207,7 @@ def get_file(ref, url):
except Exception, e:
try:
new_value = _convert.ini2value(content)
except Exception, f:
except Exception:
raise _Log.error("Can not read {{file}}", file=path, cause=e)
new_value = _replace_ref(new_value, ref)
return new_value


@ -218,7 +218,13 @@ def json2typed(json):
mode = VALUE
elif c == ",":
mode = context.pop()
elif c in "]}":
if mode != OBJECT:
context.append(mode)
mode = VALUE
elif c in "]":
mode = context.pop()
elif c in "}":
mode = context.pop()
mode = context.pop()
elif c == '"':
context.append(mode)
@ -276,6 +282,8 @@ def json2typed(json):
context.append(mode)
context.append(KEYWORD)
mode = STRING
elif c == ",":
pass
elif c == '}':
mode = context.pop()
else:


@ -234,7 +234,7 @@ class Math(object):
@staticmethod
def MAX(values):
output = None
output = Null
for v in values:
if v == None:
continue


@ -124,14 +124,14 @@ def use_settings(func):
def wrapper(*args, **kwargs):
try:
if func.func_name == "__init__" and "settings" in kwargs:
if func.func_name in ("__init__", "__new__") and "settings" in kwargs:
packed = params_pack(params, kwargs, dot.zip(params[1:], args[1:]), kwargs["settings"], defaults)
return func(args[0], **packed)
elif func.func_name == "__init__" and len(args) == 2 and len(kwargs) == 0 and isinstance(args[1], Mapping):
elif func.func_name in ("__init__", "__new__") and len(args) == 2 and len(kwargs) == 0 and isinstance(args[1], Mapping):
# ASSUME SECOND UNNAMED PARAM IS settings
packed = params_pack(params, args[1], defaults)
return func(args[0], **packed)
elif func.func_name == "__init__":
elif func.func_name in ("__init__", "__new__"):
# DO NOT INCLUDE self IN SETTINGS
packed = params_pack(params, kwargs, dot.zip(params[1:], args[1:]), defaults)
return func(args[0], **packed)
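# With __new__ handled as well, classes that cache instances in __new__ (such as the
# Cluster singleton earlier in this commit) get the same settings packing. Sketch of
# the accepted call styles (values hypothetical):
#
#     Cluster(host="http://localhost", port=9200)   # named parameters
#     Cluster(my_settings)                          # a single Mapping treated as settings
#     Cluster(port=9201, settings=my_settings)      # named parameters merged with settings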


@ -1,19 +1,18 @@
from urlparse import urlparse
from pyLibrary.dot import wrap
from urlparse import urlparse, parse_qs
from pyLibrary.dot import Null, coalesce, wrap
from pyLibrary.dot.dicts import Dict
_convert = None
_Log = None
convert = None
Log = None
def _late_import():
global _convert
global _Log
from pyLibrary import convert as _convert
from pyLibrary.debugs.logs import Log as _Log
_ = _convert
_ = _Log
global convert
global Log
from pyLibrary import convert
from pyLibrary.debugs.logs import Log
names = ["path", "query", "fragment"]
indicator = ["/", "?", "#"]
@ -50,7 +49,7 @@ class URL(object):
if value == None:
return
if not _convert:
if not convert:
_late_import()
if value.startswith("file://") or value.startswith("//"):
# urlparse DOES NOT WORK IN THESE CASES
@ -58,7 +57,7 @@ class URL(object):
self.scheme = scheme.rstrip(":")
parse(self, suffix, 0, 1)
self.query = wrap(_convert.url_param2value(self.query))
self.query = wrap(convert.url_param2value(self.query))
self.fragment = self.fragment
else:
output = urlparse(value)
@ -66,7 +65,7 @@ class URL(object):
self.port = output.port
self.host = output.netloc.split(":")[0]
self.path = output.path
self.query = wrap(_convert.url_param2value(output.query))
self.query = wrap(convert.url_param2value(output.query))
self.fragment = output.fragment
def __nonzero__(self):
@ -90,9 +89,9 @@ class URL(object):
if self.path:
url += str(self.path)
if self.query:
url = url + '?' + _convert.value2url(self.query)
url = url + '?' + convert.value2url(self.query)
if self.fragment:
url = url + '#' + _convert.value2url(self.fragment)
url = url + '#' + convert.value2url(self.fragment)
return url


@ -0,0 +1,6 @@
MoQueries
=========
A Python library that supports [Qb queries](https://github.com/klahnakoski/ActiveData/blob/dev/docs/Qb_Tutorial.md "Qb Queries") and a variety of other set operations.
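For orientation, a Qb query is expressed as a plain Mapping; a minimal sketch (index and field names are hypothetical):

    query = {
        "from": "unittest",
        "select": {"name": "count", "aggregate": "count"},
        "edges": ["build.platform"],
        "where": {"term": {"build.type": "debug"}},
        "limit": 10
    }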


@ -0,0 +1,87 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import unicode_literals
from collections import Mapping
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import wrap, set_default, split_field
from pyLibrary.dot.dicts import Dict
from pyLibrary.queries import containers
type2container = Dict()
config = Dict() # config.default IS EXPECTED TO BE SET BEFORE CALLS ARE MADE
_ListContainer = None
def _delayed_imports():
global type2container
global _ListContainer
from pyLibrary.queries.containers.lists import ListContainer as _ListContainer
_ = _ListContainer
from pyLibrary.queries.qb_usingMySQL import MySQL
from pyLibrary.queries.qb_usingES import FromES
from pyLibrary.queries.meta import FromESMetadata
set_default(type2container, {
"elasticsearch": FromES,
"mysql": MySQL,
"memory": None,
"meta": FromESMetadata
})
def wrap_from(frum, schema=None):
"""
:param frum:
:param schema:
:return:
"""
if not type2container:
_delayed_imports()
frum = wrap(frum)
if isinstance(frum, basestring):
if not containers.config.default.settings:
Log.error("expecting pyLibrary.queries.query.config.default.settings to contain default elasticsearch connection info")
type_ = None
index = frum
if frum.startswith("meta."):
type_ = "meta"
else:
index = split_field(frum)[0]
settings = set_default(
{
"index": index,
"name": frum,
"type": type_
},
containers.config.default.settings
)
return type2container[settings.type](settings)
elif isinstance(frum, Mapping) and frum.type and type2container[frum.type]:
# TODO: Ensure the frum.name is set, so we capture the deep queries
if not frum.type:
Log.error("Expecting from clause to have a 'type' property")
return type2container[frum.type](frum.settings)
elif isinstance(frum, Mapping) and (frum["from"] or isinstance(frum["from"], (list, set))):
from pyLibrary.queries.query import Query
return Query(frum, schema=schema)
elif isinstance(frum, list):
return _ListContainer(frum)
else:
return frum
import es09.util
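# Sketch of intended wrap_from() usage, assuming containers.config.default.settings
# already holds elasticsearch connection info (as the error message above requires);
# the index name is hypothetical:
#
#     containers.config.default.settings = {"type": "elasticsearch", "host": "http://localhost", "port": 9200}
#     frum = wrap_from("unittest.run.suite")                         # resolves to a FromES container
#     frum = wrap_from({"type": "mysql", "settings": my_settings})   # a Mapping with a type picks its container
#     frum = wrap_from([{"a": 1}, {"a": 2}])                         # a plain list becomes a ListContainer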


@ -16,7 +16,7 @@ from copy import copy
from types import GeneratorType
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import set_default, split_field, wrap
from pyLibrary.dot import set_default, split_field, wrap, DictList
from pyLibrary.dot.dicts import Dict
type2container = Dict()
@ -25,7 +25,7 @@ _ListContainer = None
_Cube = None
_run = None
_Query = None
_Normal = None
def _delayed_imports():
global type2container
@ -33,6 +33,7 @@ def _delayed_imports():
global _Cube
global _run
global _Query
global _Normal
from pyLibrary.queries.qb_usingMySQL import MySQL as _MySQL
from pyLibrary.queries.qb_usingES import FromES as _FromES
@ -49,10 +50,11 @@ def _delayed_imports():
_ = _run
_ = _Query
_ = _Normal
class Container(object):
__slots__ = ["data", "schema"]
__slots__ = ["data", "schema", "namespaces"]
@classmethod
def new_instance(type, frum, schema=None):
@ -100,8 +102,14 @@ class Container(object):
def __init__(self, frum, schema=None):
object.__init__(self)
if not type2container:
_delayed_imports()
self.data = frum
if isinstance(schema, list):
Log.error("expecting map from abs_name to column object")
self.schema = schema
# self.namespaces = wrap([_Normal()])
def query(self, query):
if query.frum != self:
@ -135,7 +143,7 @@ class Container(object):
_ = format
Log.error("not implemented")
def get_columns(self, frum):
def get_columns(self, table):
"""
USE THE frum TO DETERMINE THE COLUMNS
"""


@ -16,11 +16,11 @@ from pyLibrary import convert
from pyLibrary.collections.matrix import Matrix
from pyLibrary.collections import MAX, OR
from pyLibrary.queries.containers import Container
# from pyLibrary.queries.query import _normalize_edge
from pyLibrary.dot import Null, Dict
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import wrap, wrap_dot, listwrap
from pyLibrary.debugs.logs import Log
from pyLibrary.queries.query import _normalize_edge
class Cube(Container):
@ -272,7 +272,7 @@ class Cube(Container):
if len(self.edges)==1 and self.edges[0].domain.type=="index":
# USE THE STANDARD LIST FILTER
from pyLibrary.queries import qb
return qb.filter(where, self.data.values()[0].cube)
return qb.filter(self.data.values()[0].cube, where)
else:
# FILTER DOES NOT ALTER DIMENSIONS, JUST WHETHER THERE ARE VALUES IN THE CELLS
Log.unexpected("Incomplete")


@ -10,26 +10,37 @@
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from collections import Mapping
from pyLibrary import convert
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import Dict, wrap
from pyLibrary.dot import Dict, wrap, listwrap, unwraplist
from pyLibrary.queries import qb
from pyLibrary.queries.containers import Container
from pyLibrary.queries.expressions import TRUE_FILTER
from pyLibrary.queries.list.aggs import is_aggs, list_aggs
from pyLibrary.queries.domains import is_keyword
from pyLibrary.queries.expressions import TRUE_FILTER, qb_expression_to_python
from pyLibrary.queries.lists.aggs import is_aggs, list_aggs
from pyLibrary.queries.meta import Column
from pyLibrary.thread.threads import Lock
class ListContainer(Container):
def __init__(self, frum, schema=None):
Container.__init__(self, frum, schema)
self.frum = list(frum)
#TODO: STORE THIS LIKE A CUBE FOR FASTER ACCESS AND TRANSFORMATION
frum = list(frum)
if schema == None:
self.schema = get_schema_from_list(frum)
Container.__init__(self, frum, schema)
self.frum = frum
@property
def query_path(self):
return None
def query(self, q):
frum = self.frum
frum = self
if is_aggs(q):
frum = list_aggs(frum, q)
frum = list_aggs(frum.data, q)
else: # SETOP
try:
if q.filter != None or q.esfilter != None:
@ -51,22 +62,55 @@ class ListContainer(Container):
return frum.format(q.format)
def update(self, command):
"""
EXPECTING command == {"set":term, "clear":term, "where":where}
THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
THE where CLAUSE IS AN ES FILTER
"""
command = wrap(command)
if command.where==None:
filter_ = lambda: True
else:
filter_ = _exec("temp = lambda row: "+qb_expression_to_python(command.where))
for c in self.data:
if filter_(c):
for k in command["clear"].keys():
c[k] = None
for k, v in command.set.items():
c[k] = v
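# Sketch of the command structure accepted above (data and filter are hypothetical):
#
#     c = ListContainer([{"a": 1, "b": "old"}, {"a": 2, "b": "old"}])
#     c.update({"set": {"b": "new"}, "where": {"term": {"a": 1}}})
#     # rows matching the where filter get b set to "new"; a "clear" clause
#     # would null the named fields instead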
def filter(self, where):
return self.where(where)
def where(self, where):
_ = where
Log.error("not implemented")
if isinstance(where, Mapping):
temp = None
exec("def temp(row):\n return "+qb_expression_to_python(where))
else:
temp = where
return ListContainer(filter(temp, self.data), self.schema)
def sort(self, sort):
_ = sort
Log.error("not implemented")
return ListContainer(qb.sort(self.data, sort), self.schema)
def select(self, select):
_ = select
Log.error("not implemented")
selects = listwrap(select)
if selects[0].value == "*" and selects[0].name == ".":
return self
for s in selects:
if not isinstance(s.value, basestring) or not is_keyword(s.value):
Log.error("selecting on structure, or expressions, not supported yet")
#TODO: DO THIS WITH JUST A SCHEMA TRANSFORM, DO NOT TOUCH DATA
#TODO: HANDLE STRUCTURE AND EXPRESSIONS
new_schema = {s.name: self.schema[s.value] for s in selects}
new_data = [{s.name: d[s.value] for s in selects} for d in self.data]
return ListContainer(frum=new_data, schema=new_schema)
def window(self, window):
_ = window
@ -78,17 +122,35 @@ class ListContainer(Container):
def format(self, format):
if format == "table":
frum = convert.list2table(self.data)
frum.meta.format = "table"
frum = convert.list2table(self.data, self.schema.keys())
elif format == "cube":
frum = convert.list2cube(self.data, self.schema.keys())
else:
frum = wrap({
"meta": {"format": "list"},
"data": self.data
})
frum = self
def get_columns(self, frum):
return frum
def insert(self, documents):
self.data.extend(documents)
def extend(self, documents):
self.data.extend(documents)
def add(self, doc):
self.data.append(doc)
def to_dict(self):
return wrap({
"meta": {"format": "list"},
"data": [{k: unwraplist(v) for k, v in row.items()} for row in self.data]
})
def get_columns(self, table=None):
return self.schema.values()
def __getitem__(self, item):
return self.data[item]
def get_schema_from_list(frum):
"""
@ -98,7 +160,7 @@ def get_schema_from_list(frum):
_get_schema_from_list(frum, columns, [], 0)
return columns
def _get_schema_from_list(frum, columns, prefix, depth):
def _get_schema_from_list(frum, columns, prefix, nested_path):
"""
SCAN THE LIST FOR COLUMN TYPES
"""
@ -111,14 +173,21 @@ def _get_schema_from_list(frum, columns, prefix, depth):
names[name] = new_type
if this_type == "object":
_get_schema_from_list([value], columns, prefix + [name], depth)
_get_schema_from_list([value], columns, prefix + [name], nested_path)
elif this_type == "nested":
_get_schema_from_list(value, columns, prefix + [name], depth+1)
if not nested_path:
_get_schema_from_list(value, columns, prefix + [name], [name])
else:
_get_schema_from_list(value, columns, prefix + [name], [nested_path[0]+"."+name]+nested_path)
for n, t in names.items():
full_name = ".".join(prefix + [n])
column = {"name": full_name, "value": full_name, "type": t, "depth": depth}
columns[full_name] = column
column = Column(
name=full_name,
type=t,
nested_path=nested_path
)
columns[column.name] = column
_type_to_name = {
@ -229,3 +298,7 @@ _merge_type = {
def _exec(code):
temp = None
exec code
return temp


@ -14,7 +14,7 @@ import itertools
from pyLibrary.collections.matrix import Matrix
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import listwrap
from pyLibrary.dot import listwrap, unwrap
from pyLibrary.queries import windows
from pyLibrary.queries.containers.cube import Cube
from pyLibrary.queries.domains import SimpleSetDomain, DefaultDomain


@ -26,16 +26,20 @@ DEFAULT_QUERY_LIMIT = 20
class Dimension(Container):
__slots__ = ["name", "full_name", "where", "type", "limit", "index", "parent", "edges", "partitions", "fields"]
def __init__(self, dim, parent, qb):
dim = wrap(dim)
self.name = dim.name
self.parent = parent
self.parent = coalesce(parent)
self.full_name = join_field(split_field(self.parent.full_name)+[self.name])
self.edges = None # FOR NOW
dot.set_default(self, dim)
self.esfilter = dim.esfilter
self.where = dim.where
self.type = coalesce(dim.type, "set")
self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT)
self.index = coalesce(dim.index, coalesce(parent, Null).index, qb.es.settings.name)
self.index = coalesce(dim.index, coalesce(parent, Null).index, qb.settings.index)
if not self.index:
Log.error("Expecting an index name")
@ -61,18 +65,19 @@ class Dimension(Container):
if dim.partitions:
return # ALREADY HAVE PARTS
if dim.type not in KNOWN - ALGEBRAIC:
if self.type not in KNOWN - ALGEBRAIC:
return # PARTS OR TOO FUZZY (OR TOO NUMEROUS) TO FETCH
qb.get_columns()
with Timer("Get parts of {{name}}", {"name": self.name}):
parts = qb.query({
"from": self.index,
"select": {"name": "count", "aggregate": "count"},
"edges": edges,
"esfilter": self.esfilter,
"where": self.where,
"limit": self.limit
})
Log.note("{{name}} has {{num}} parts", name=self.name, num=len(parts))
Log.note("{{name}} has {{num}} parts", name= self.name, num= len(parts))
d = parts.edges[0].domain
@ -101,7 +106,7 @@ class Dimension(Container):
if p:
partitions.append({
"value": g,
"esfilter": {"and": [
"where": {"and": [
{"term": {e.value: g[e.name]}}
for e in edges
]},
@ -116,7 +121,7 @@ class Dimension(Container):
{
"name": str(d.partitions[i].name), # CONVERT TO STRING
"value": d.getEnd(d.partitions[i]),
"esfilter": {"term": {edges[0].value: d.partitions[i].value}},
"where": {"term": {edges[0].value: d.partitions[i].value}},
"count": count
}
for i, count in enumerate(parts)
@ -142,13 +147,13 @@ class Dimension(Container):
{
"name": str(d.partitions[i].name), # CONVERT TO STRING
"value": d.getEnd(d.partitions[i]),
"esfilter": {"term": {edges[0].value: d.partitions[i].value}},
"where": {"term": {edges[0].value: d.partitions[i].value}},
"count": SUM(subcube),
"partitions": [
{
"name": str(d2.partitions[j].name), # CONVERT TO STRING
"value": edges2value(d.getEnd(d.partitions[i]), d2.getEnd(d2.partitions[j])),
"esfilter": {"and": [
"where": {"and": [
{"term": {edges[0].value: d.partitions[i].value}},
{"term": {edges[1].value: d2.partitions[j].value}}
]},
@ -165,11 +170,17 @@ class Dimension(Container):
parse_partition(self) # RELATE THE PARTS TO THE PARENTS
def __getitem__(self, item):
return self.__getattr__(item)
def __getattr__(self, key):
"""
RETURN CHILD EDGE OR PARTITION BY NAME
"""
#TODO: IGNORE THE STANDARD DIMENSION PROPERTIES TO AVOID ACCIDENTAL SELECTION OF EDGE OR PART
if key in Dimension.__slots__:
return None
e = self.edges[key]
if e:
return e
@ -187,14 +198,14 @@ class Dimension(Container):
# USE EACH EDGE AS A PARTITION, BUT isFacet==True SO IT ALLOWS THE OVERLAP
partitions = [
{
"name":v.name,
"value":v.name,
"esfilter":v.esfilter,
"style":v.style,
"weight":v.weight # YO! WHAT DO WE *NOT* COPY?
"name": v.name,
"value": v.name,
"where": v.where,
"style": v.style,
"weight": v.weight # YO! WHAT DO WE *NOT* COPY?
}
for i, v in enumerate(self.edges)
if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT) and v.esfilter
if i < coalesce(self.limit, DEFAULT_QUERY_LIMIT) and v.where
]
self.isFacet = True
elif kwargs.depth == None: # ASSUME self.fields IS A dict
@ -205,7 +216,7 @@ class Dimension(Container):
partitions.append({
"name":part.name,
"value":part.value,
"esfilter":part.esfilter,
"where":part.where,
"style":coalesce(part.style, part.parent.style),
"weight":part.weight # YO! WHAT DO WE *NOT* COPY?
})
@ -214,7 +225,7 @@ class Dimension(Container):
{
"name":v.name,
"value":v.value,
"esfilter":v.esfilter,
"where":v.where,
"style":v.style,
"weight":v.weight # YO! WHAT DO WE *NOT* COPY?
}
@ -232,7 +243,7 @@ class Dimension(Container):
partitions.append({
"name":join_field(split_field(subpart.parent.name) + [subpart.name]),
"value":subpart.value,
"esfilter":subpart.esfilter,
"where":subpart.where,
"style":coalesce(subpart.style, subpart.parent.style),
"weight":subpart.weight # YO! WHAT DO WE *NOT* COPY?
})
@ -324,12 +335,12 @@ def parse_partition(part):
p.value = coalesce(p.value, p.name)
p.parent = part
if not part.esfilter:
if not part.where:
if len(part.partitions) > 100:
Log.error("Must define an esfilter on {{name}} there are too many partitions ({{num_parts}})",
Log.error("Must define an where on {{name}} there are too many partitions ({{num_parts}})",
name= part.name,
num_parts= len(part.partitions))
# DEFAULT esfilter IS THE UNION OF ALL CHILD FILTERS
# DEFAULT where IS THE UNION OF ALL CHILD FILTERS
if part.partitions:
part.esfilter = {"or": part.partitions.esfilter}
part.where = {"or": part.partitions.where}


@ -14,16 +14,18 @@ from collections import Mapping
from numbers import Number
import re
import itertools
from pyLibrary import convert
from pyLibrary.debugs.logs import Log
from pyLibrary.maths import Math
from pyLibrary.queries.unique_index import UniqueIndex
from pyLibrary.dot import coalesce, Dict, set_default, Null, listwrap
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import wrap, unwrap
from pyLibrary.dot import wrap
from pyLibrary.times.dates import Date
from pyLibrary.times.durations import Duration
ALGEBRAIC = {"time", "duration", "numeric", "count", "datetime"} # DOMAINS THAT HAVE ALGEBRAIC OPERATIONS DEFINED
KNOWN = {"set", "boolean", "duration", "time", "numeric"} # DOMAINS THAT HAVE A KNOWN NUMBER FOR PARTS AT QUERY TIME
PARTITION = {"uid", "set", "boolean"} # DIMENSIONS WITH CLEAR PARTS
@ -132,6 +134,7 @@ class DefaultDomain(Domain):
self.partitions = DictList()
self.map = dict()
self.map[None] = self.NULL
self.limit = desc.get('limit')
def compare(self, a, b):
return value_compare(a.value, b.value)
@ -162,6 +165,7 @@ class DefaultDomain(Domain):
def as_dict(self):
output = Domain.as_dict(self)
output.partitions = self.partitions
output.limit = self.limit
return output
@ -284,6 +288,8 @@ class SimpleSetDomain(Domain):
return self.partitions[index]
def getKeyByIndex(self, index):
if index < 0 or index >= len(self.partitions):
return None
return self.partitions[index][self.key]
def getKey(self, part):
@ -533,6 +539,70 @@ class DurationDomain(Domain):
return output
class NumericDomain(Domain):
__slots__ = ["max", "min"]
def __new__(cls, **desc):
if not desc.get('partitions') and not desc.get('interval'):
return object.__new__(cls)
else:
return object.__new__(RangeDomain)
def __init__(self, **desc):
Domain.__init__(self, **desc)
self.min = desc.get('min')
self.max = desc.get('max')
def compare(self, a, b):
return value_compare(a, b)
def getCanonicalPart(self, part):
return part
def getIndexByKey(self, key):
return key
def getPartByKey(self, key):
if self.min!=None and key < self.min:
return self.NULL
if self.max!=None and key >= self.max:
return self.NULL
return key
def getKey(self, part):
return part
def getKeyByIndex(self, index):
return index
def as_dict(self):
output = Domain.as_dict(self)
output.min = self.min
output.max = self.max
return output
class UniqueDomain(Domain):
__slots__ = ()
def compare(self, a, b):
return value_compare(a, b)
def getCanonicalPart(self, part):
return part
def getPartByKey(self, key):
return key
def getKey(self, part):
return part
def getEnd(self, value):
return value
class RangeDomain(Domain):
__slots__ = ["max", "min", "interval", "partitions", "NULL"]
@ -640,9 +710,10 @@ name_to_type = {
"value": ValueDomain,
"default": DefaultDomain,
"set": SimpleSetDomain,
"uid": DefaultDomain,
"time": TimeDomain,
"duration": DurationDomain,
"range": RangeDomain
"range": NumericDomain,
"uid": UniqueDomain,
"numeric": NumericDomain
}


@ -82,12 +82,8 @@ class _MVEL(object):
path = split_field(fromPath)
# ADD LOCAL VARIABLES
from pyLibrary.queries.es09.util import INDEX_CACHE
columns = INDEX_CACHE[path[0]].columns
for i, c in enumerate(columns):
if c.name=="attachments":
Log.debug("")
if c.name.find("\\.") >= 0:
self.prefixMap.insert(0, {
"path": c.name,


@ -10,7 +10,6 @@
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from copy import deepcopy
from datetime import datetime
from pyLibrary import convert
@ -21,11 +20,9 @@ from pyLibrary.debugs.logs import Log
from pyLibrary.maths import Math
from pyLibrary.queries import domains
from pyLibrary.dot.dicts import Dict
from pyLibrary.dot import split_field, join_field, coalesce
from pyLibrary.dot import split_field, join_field, coalesce, Null
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import wrap
from pyLibrary.queries import qb
from pyLibrary.queries.es09 import expressions
from pyLibrary.queries.es09.expressions import value2MVEL, isKeyword
from pyLibrary.queries.expressions import simplify_esfilter
from pyLibrary.times import durations
@ -34,21 +31,19 @@ from pyLibrary.times import durations
TrueFilter = {"match_all": {}}
DEBUG = False
INDEX_CACHE = {} # MATCH NAMES TO ES URL AND COLUMNS eg {name:{"url":url, "columns":columns"}}
def post(es, FromES, limit):
if not FromES.facets and FromES.size == 0 and not FromES.aggs:
def post(es, es_query, limit):
if not es_query.facets and es_query.size == 0 and not es_query.aggs:
Log.error("FromES is sending no facets")
# DO NOT KNOW WHY THIS WAS HERE
# if isinstance(query.select, list) or len(query.edges) and not FromES.facets.keys and FromES.size == 0:
# Log.error("FromES is sending no facets")
postResult = None
post_result = None
try:
postResult = es.search(FromES)
post_result = es.search(es_query)
for facetName, f in postResult.facets.items():
for facetName, f in post_result.facets.items():
if f._type == "statistical":
continue
if not f.terms:
@ -59,7 +54,7 @@ def post(es, FromES, limit):
except Exception, e:
Log.error("Error with FromES", e)
return postResult
return post_result
def build_es_query(query):
@ -86,90 +81,7 @@ def build_es_query(query):
return output
def parse_columns(parent_path, esProperties):
"""
RETURN THE COLUMN DEFINITIONS IN THE GIVEN esProperties OBJECT
"""
columns = DictList()
for name, property in esProperties.items():
if parent_path:
path = join_field(split_field(parent_path) + [name])
else:
path = name
if property.type == "nested" and property.properties:
# NESTED TYPE IS A NEW TYPE DEFINITION
# MARKUP CHILD COLUMNS WITH THE EXTRA DEPTH
child_columns = deepcopy(parse_columns(path, property.properties))
self_columns = deepcopy(child_columns)
for c in self_columns:
c.depth += 1
columns.extend(self_columns)
columns.append({
"name": join_field(split_field(path)[1::]),
"type": "nested",
"useSource": False
})
if path not in INDEX_CACHE:
pp = split_field(parent_path)
for i in qb.reverse(range(len(pp))):
c = INDEX_CACHE.get(join_field(pp[:i + 1]), None)
if c:
INDEX_CACHE[path] = c.copy()
break
else:
Log.error("Can not find parent")
INDEX_CACHE[path].name = path
INDEX_CACHE[path].columns = child_columns
continue
if property.properties:
child_columns = parse_columns(path, property.properties)
columns.extend(child_columns)
columns.append({
"name": join_field(split_field(path)[1::]),
"type": "object",
"useSource": False
})
if property.dynamic:
continue
if not property.type:
continue
if property.type == "multi_field":
property.type = property.fields[name].type # PULL DEFAULT TYPE
for i, (n, p) in enumerate(property.fields.items()):
if n == name:
# DEFAULT
columns.append({"name": join_field(split_field(path)[1::]), "type": p.type, "useSource": p.index == "no"})
else:
columns.append({"name": join_field(split_field(path)[1::]) + "." + n, "type": p.type, "useSource": p.index == "no"})
continue
if property.type in ["string", "boolean", "integer", "date", "long", "double"]:
columns.append({
"name": join_field(split_field(path)[1::]),
"type": property.type,
"useSource": property.index == "no"
})
if property.index_name and name != property.index_name:
columns.append({
"name": property.index_name,
"type": property.type,
"useSource": property.index == "no"
})
elif property.enabled == None or property.enabled == False:
columns.append({
"name": join_field(split_field(path)[1::]),
"type": "object",
"useSource": True
})
else:
Log.warning("unknown type {{type}} for property {{path}}", type= property.type, path= path)
return columns
def compileTime2Term(edge):
@ -200,7 +112,7 @@ def compileTime2Term(edge):
if Math.round(value) == 0:
return edge.domain.NULL
d = datetime(str(value)[:4:], str(value).right(2), 1)
d = datetime(str(value)[:4:], str(value)[-2:], 1)
d = d.addMilli(offset)
return edge.domain.getPartByKey(d)
else:


@ -12,21 +12,22 @@ from __future__ import division
from __future__ import absolute_import
from collections import Mapping
from pyLibrary import convert
from pyLibrary.collections import MAX
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import listwrap, Dict, wrap, literal_field, set_default, coalesce, Null, split_field, join_field
from pyLibrary.maths import Math
from pyLibrary.queries import qb, es09
from pyLibrary.queries.dimensions import Dimension
from pyLibrary.queries.domains import PARTITION, SimpleSetDomain, is_keyword
from pyLibrary.queries.es14.util import aggregates1_4
from pyLibrary.queries.domains import PARTITION, SimpleSetDomain, is_keyword, DefaultDomain
from pyLibrary.queries.es14.util import aggregates1_4, NON_STATISTICAL_AGGS
from pyLibrary.queries.expressions import simplify_esfilter, qb_expression_to_ruby, get_all_vars
from pyLibrary.queries.query import DEFAULT_LIMIT
from pyLibrary.times.timer import Timer
def is_aggsop(es, query):
es.cluster.get_metadata()
if any(map(es.cluster.version.startswith, ["1.4.", "1.5.", "1.6."])) and (query.edges or query.groupby or any(a != None and a != "none" for a in listwrap(query.select).aggregate)):
if any(map(es.cluster.version.startswith, ["1.4.", "1.5.", "1.6.", "1.7."])) and (query.edges or query.groupby or any(a != None and a != "none" for a in listwrap(query.select).aggregate)):
return True
return False
@ -40,24 +41,60 @@ def es_aggsop(es, frum, query):
for s in select:
if s.aggregate == "count" and (s.value == None or s.value == "."):
s.pull = "doc_count"
elif s.value == ".":
if frum.typed:
# STATISTICAL AGGS IMPLY $value, WHILE OTHERS CAN BE ANYTHING
if s.aggregate in NON_STATISTICAL_AGGS:
#TODO: HANDLE BOTH $value AND $objects TO COUNT
Log.error("do not know how to handle")
else:
s.value = "$value"
new_select["$value"] += [s]
else:
if s.aggregate in NON_STATISTICAL_AGGS:
#TODO: WE SHOULD BE ABLE TO COUNT, BUT WE MUST *OR* ALL LEAF VALUES TO DO IT
Log.error("do not know how to handle")
else:
Log.error('Not expecting ES to have a value at "." to which {{agg}} can be applied', agg=s.aggregate)
elif is_keyword(s.value):
new_select[literal_field(s.value)] += [s]
else:
formula.append(s)
for litral_field, many in new_select.items():
if len(many)>1:
canonical_name=literal_field(many[0].name)
es_query.aggs[canonical_name].stats.field = many[0].value
for canonical_name, many in new_select.items():
representative = many[0]
if representative.value == ".":
Log.error("do not know how to handle")
else:
field_name = representative.value
if len(many) > 1 or many[0].aggregate in ("median", "percentile"):
# canonical_name=literal_field(many[0].name)
for s in many:
if s.aggregate == "count":
s.pull = canonical_name + ".count"
es_query.aggs[literal_field(canonical_name)].stats.field = field_name
s.pull = literal_field(canonical_name) + ".count"
elif s.aggregate == "median":
#ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
key=literal_field(canonical_name + " percentile")
es_query.aggs[key].percentiles.field = field_name
es_query.aggs[key].percentiles.percents += [50]
s.pull = key + ".values.50\.0"
elif s.aggregate == "percentile":
#ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
key=literal_field(canonical_name + " percentile")
percent = Math.round(s.percentile * 100, decimal=6)
es_query.aggs[key].percentiles.field = field_name
es_query.aggs[key].percentiles.percents += [percent]
s.pull = key + ".values." + literal_field(unicode(percent))
else:
s.pull = canonical_name + "." + aggregates1_4[s.aggregate]
es_query.aggs[literal_field(canonical_name)].stats.field = field_name
s.pull = literal_field(canonical_name) + "." + aggregates1_4[s.aggregate]
else:
s = many[0]
s.pull = literal_field(s.value) + ".value"
es_query.aggs[literal_field(s.value)][aggregates1_4[s.aggregate]].field = s.value
es_query.aggs[literal_field(canonical_name)][aggregates1_4[representative.aggregate]].field = field_name
representative.pull = literal_field(canonical_name) + ".value"
for i, s in enumerate(formula):
new_select[unicode(i)] = s
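# Sketch (not part of this changeset), assuming a hypothetical numeric field "run_time" selected
# with aggregate "median": the branch above emits a separate `percentiles` aggregation keyed
# "<name> percentile", asks only for the 50th percent, and pulls the result from ".values.50\.0".
example_median_aggs = {"aggs": {"run_time percentile": {"percentiles": {"field": "run_time", "percents": [50]}}}}
# ES answers with {"run_time percentile": {"values": {"50.0": <median>}}}, matching s.pull above.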
@ -71,6 +108,8 @@ def es_aggsop(es, frum, query):
start += d.num_columns
if query.where:
#TODO: INCLUDE FILTERS ON EDGES
filter = simplify_esfilter(query.where)
es_query = Dict(
aggs={"_filter": set_default({"filter": filter}, es_query)}
@ -79,13 +118,18 @@ def es_aggsop(es, frum, query):
if len(split_field(frum.name)) > 1:
es_query = wrap({
"size": 0,
"aggs": {"_nested": set_default({
"nested": {
"path": join_field(split_field(frum.name)[1::])
}
}, es_query)}
"aggs": {"_nested": set_default(
{
"nested": {
"path": frum.query_path
}
},
es_query
)}
})
es_query.size=0
with Timer("ES query time") as es_duration:
result = es09.util.post(es, es_query, query.limit)
@ -109,10 +153,35 @@ def es_aggsop(es, frum, query):
class AggsDecoder(object):
def __new__(cls, *args, **kwargs):
e = args[0]
def __new__(cls, e=None, query=None, *args, **kwargs):
if query.groupby:
# GROUPBY ASSUMES WE IGNORE THE DOMAIN RANGE
e.allowNulls = False
else:
e.allowNulls = coalesce(e.allowNulls, True)
if e.value and e.domain.type == "default":
return object.__new__(DefaultDecoder, e.copy())
if query.groupby:
return object.__new__(DefaultDecoder, e.copy())
if is_keyword(e.value):
cols = query.frum.get_columns()
col = cols.filter(lambda c: c.name == e.value)[0]
if not col:
return object.__new__(DefaultDecoder, e.copy())
limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)
if col.partitions != None:
e.domain = SimpleSetDomain(partitions=col.partitions[:limit:])
else:
e.domain = set_default(DefaultDomain(limit=limit), e.domain.as_dict())
return object.__new__(DefaultDecoder, e.copy())
elif isinstance(e.value, (list, Mapping)):
Log.error("Not supported yet")
else:
return object.__new__(DefaultDecoder, e.copy())
if e.value and e.domain.type in PARTITION:
return object.__new__(SetDecoder, e)
if isinstance(e.domain.dimension, Dimension):
@ -167,10 +236,33 @@ class AggsDecoder(object):
class SetDecoder(AggsDecoder):
def append_query(self, es_query, start):
self.start = start
return wrap({"aggs": {
"_match": set_default({"terms": {"field": self.edge.value}}, es_query),
"_missing": set_default({"missing": {"field": self.edge.value}}, es_query),
}})
domain = self.edge.domain
include = [p[domain.key] for p in domain.partitions]
if self.edge.allowNulls:
return wrap({"aggs": {
"_match": set_default({"terms": {
"field": self.edge.value,
"size": 0,
"include": include
}}, es_query),
"_missing": set_default(
{"filter": {"or": [
{"missing": {"field": self.edge.value}},
{"not": {"terms": {self.edge.value: include}}}
]}},
es_query
),
}})
else:
return wrap({"aggs": {
"_match": set_default({"terms": {
"field": self.edge.value,
"size": 0,
"include": include
}}, es_query)
}})
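# Sketch (not part of this changeset), assuming a hypothetical edge on field "platform" whose
# domain partitions are ["linux", "mac", "win"]: append_query() above now emits roughly
example_set_decoder_aggs = {"aggs": {
"_match": {"terms": {"field": "platform", "size": 0, "include": ["linux", "mac", "win"]}},
"_missing": {"filter": {"or": [{"missing": {"field": "platform"}}, {"not": {"terms": {"platform": ["linux", "mac", "win"]}}}]}}
}}
# so the "_missing" bucket now also catches values outside the declared partitions, not only nulls.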
def get_value(self, index):
return self.edge.domain.getKeyByIndex(index)
@ -216,7 +308,7 @@ def _range_composer(edge, domain, es_query, to_float):
missing_filter = set_default(
{"filter": {"or": [
missing_range,
{"missing": {"field": get_all_vars(edge.value)}}
{"or": [{"missing": {"field": v}} for v in get_all_vars(edge.value)]}
]}},
es_query
)
@ -332,7 +424,7 @@ class DefaultDecoder(SetDecoder):
def __init__(self, edge, query):
AggsDecoder.__init__(self, edge, query)
self.edge = self.edge.copy()
self.edge.allowNulls = False # SINCE WE DO NOT KNOW THE DOMAIN, WE HAVE NO SENSE OF WHAT IS OUTSIDE THAT DOMAIN, allowNulls==True MAKES NO SENSE
# self.edge.allowNulls = False # SINCE WE DO NOT KNOW THE DOMAIN, WE HAVE NO SENSE OF WHAT IS OUTSIDE THAT DOMAIN, allowNulls==True MAKES NO SENSE
self.edge.domain.partitions = set()
self.edge.domain.limit = coalesce(self.edge.domain.limit, query.limit, 10)

@ -0,0 +1,204 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http:# mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from pyLibrary import queries
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import split_field, DictList, listwrap, literal_field, wrap, coalesce, Dict
from pyLibrary.queries import es09
from pyLibrary.queries.domains import is_keyword
from pyLibrary.queries.es14.setop import format_dispatch
from pyLibrary.queries.es14.util import qb_sort_to_es_sort
from pyLibrary.queries.expressions import query_get_all_vars, qb_expression_to_ruby, expression_map, qb_expression_to_esfilter
from pyLibrary.queries.unique_index import UniqueIndex
from pyLibrary.thread.threads import Thread
from pyLibrary.times.timer import Timer
def is_deepop(es, query):
if query.edges or query.groupby:
return False
vars = query_get_all_vars(query)
columns = query.frum.get_columns()
if len(split_field(query.frum.name)) > 1:
return True
if any(c for c in columns if c.nested_path and c.name in vars):
return True
return False
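# Sketch (not part of this changeset): a query is "deep" when its from-clause names a nested path
# or when it touches a column with a nested_path, e.g. the hypothetical
example_deep_query = {"from": "bug_version.attachments", "select": ["attach_id"], "where": {"eq": {"isobsolete": 0}}}
# has two segments in its from-path, so is_deepop() returns True and es_deepop() below is used.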
def es_deepop(es, query):
columns = query.frum.get_columns()
query_path = query.frum.query_path
columns = UniqueIndex(keys=["name"], data=sorted(columns, lambda a, b: cmp(len(b.nested_path), len(a.nested_path))), fail_on_dup=False)
_map = {c.name: c.abs_name for c in columns}
where = qb_expression_to_esfilter(expression_map(query.where, _map))
more_filter = {
"and": [
where,
{"not": {
"nested": {
"path": query_path,
"filter": {
"match_all": {}
}
}
}}
]
}
es_query = wrap({
"query": {
"nested": {
"path": query_path,
"inner_hits": {},
"filter": where
},
},
"fields": []
})
es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
es_query.sort = qb_sort_to_es_sort(query.sort)
is_list = isinstance(query.select, list)
new_select = DictList()
def get_pull(column):
if column.nested_path:
return "_inner" + column.abs_name[len(column.nested_path[0]):]
else:
return "fields." + literal_field(column.abs_name)
i = 0
for s in listwrap(query.select):
if s.value == "*":
# IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
for c in columns:
if c.relative and c.type not in ["nested", "object"]:
if not c.nested_path:
es_query.fields.append(c.abs_name)
new_select.append({
"name": c.name,
"pull": get_pull(c),
"nested_path": c.nested_path[0],
"put": {"name": c.name, "index": i, "child": "."}
})
i += 1
# REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
col_names = [c.name for c in columns if c.relative]
for n in new_select:
if n.name.startswith("..") and n.name.lstrip(".") not in col_names:
n.name = n.put.name = n.name.lstrip(".")
elif s.value == ".":
for c in columns:
if c.relative and c.type not in ["nested", "object"]:
if not c.nested_path:
es_query.fields.append(c.abs_name)
new_select.append({
"name": c.name,
"pull": get_pull(c),
"nested_path": c.nested_path[0],
"put": {"name": ".", "index": i, "child": c.abs_name}
})
i += 1
elif isinstance(s.value, basestring) and s.value.endswith(".*") and is_keyword(s.value[:-2]):
parent = s.value[:-1]
prefix = len(parent)
for c in columns:
if c.name.startswith(parent):
pull = get_pull(c)
if not c.nested_path:
es_query.fields.append(c.abs_name)
new_select.append({
"name": s.name + "." + c.name[prefix:],
"pull": pull,
"nested_path": c.nested_path[0],
"put": {"name": s.name + "." + c[prefix:], "index": i, "child": "."}
})
elif isinstance(s.value, basestring) and is_keyword(s.value):
parent = s.value + "."
prefix = len(parent)
net_columns = [c for c in columns if c.name.startswith(parent)]
if not net_columns:
c = columns[(s.value,)]
pull = get_pull(c)
if not c.nested_path:
es_query.fields.append(s.value)
new_select.append({
"name": s.name if is_list else ".",
"pull": pull,
"nested_path": c.nested_path[0],
"put": {"name": s.name, "index": i, "child": "."}
})
else:
for n in net_columns:
pull = get_pull(n)
if not n.nested_path:
es_query.fields.append(n.abs_name)
new_select.append({
"name": s.name if is_list else ".",
"pull": pull,
"nested_path": n.nested_path[0],
"put": {"name": s.name, "index": i, "child": n[prefix:]}
})
i += 1
elif isinstance(s.value, list):
Log.error("need an example")
es_query.fields.extend([v for v in s.value])
else:
Log.error("need an example")
es_query.script_fields[literal_field(s.name)] = {"script": qb_expression_to_ruby(s.value)}
new_select.append({
"name": s.name if is_list else ".",
"value": s.name,
"put": {"name": s.name, "index": i, "child": "."}
})
i += 1
more = []
def get_more(please_stop):
more.append(es09.util.post(
es,
Dict(
query={"filtered": {"filter": more_filter}},
fields=es_query.fields
),
query.limit
))
need_more=Thread.run("get more", target=get_more)
with Timer("call to ES") as call_timer:
data = es09.util.post(es, es_query, query.limit)
# RETURN A LIST OF INNER OBJECTS
def inners():
for t in data.hits.hits:
for i in t.inner_hits[query_path].hits.hits:
t._inner = i._source
yield t
Thread.join(need_more)
for t in more[0].hits.hits:
yield t
try:
formatter, groupby_formatter, mime_type = format_dispatch[query.format]
output = formatter(inners(), new_select, query)
output.meta.es_response_time = call_timer.duration
output.meta.content_type = mime_type
output.meta.es_query = es_query
return output
except Exception, e:
Log.error("problem formatting", e)

@ -15,6 +15,7 @@ from pyLibrary import convert
from pyLibrary.collections.matrix import Matrix
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import Dict, set_default, coalesce, wrap
from pyLibrary.maths import Math
from pyLibrary.queries.containers.cube import Cube
from pyLibrary.queries.es14.aggs import count_dim, aggs_iterator, format_dispatch
@ -27,7 +28,7 @@ def format_cube(decoders, aggs, start, query, select):
coord = tuple(d.get_index(row) for d in decoders)
for s, m in matricies:
try:
if m[coord]:
if m[coord]: # THIS CAN HAPPEN WHEN THE QUERIED SET IS SMALLER THAN WHAT IS AVAILABLE IN ES
Log.error("Not expected")
m[coord] = agg[s.pull]
except Exception, e:
@ -115,6 +116,8 @@ def format_table_from_aggop(decoders, aggs, start, query, select):
row = []
for s in select:
if not s.pull:
Log.error("programmer error")
row.append(agg[s.pull])
return Dict(
@ -196,14 +199,23 @@ def format_list_from_aggop(decoders, aggs, start, query, select):
agg = b
b = coalesce(agg._filter, agg._nested)
item = Dict()
for s in select:
item[s.name] = agg[s.pull]
if isinstance(query.select, list):
item = Dict()
for s in select:
item[s.name] = agg[s.pull]
else:
item = agg[select[0].pull]
return wrap({
"meta": {"format": "list"},
"data": [item]
})
if query.edges or query.groupby:
return wrap({
"meta": {"format": "list"},
"data": [item]
})
else:
return wrap({
"meta": {"format": "value"},
"data": item
})
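# Sketch (not part of this changeset): with a single select and no edges or groupby, the new else
# branch returns a bare value instead of a one-element list, e.g.
example_value_result = {"meta": {"format": "value"}, "data": 42}
# while edge/groupby queries keep the {"format": "list", "data": [...]} shape above.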

@ -10,108 +10,30 @@
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from collections import Mapping
from pyLibrary import queries
from pyLibrary.collections.matrix import Matrix
from pyLibrary.collections import AND, UNION
from pyLibrary.dot import coalesce, split_field, set_default, Dict, unwraplist, literal_field
from pyLibrary.collections import AND
from pyLibrary.dot import coalesce, split_field, set_default, Dict, unwraplist, literal_field, join_field, unwrap
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import listwrap
from pyLibrary.queries.domains import is_keyword
from pyLibrary.queries import domains
from pyLibrary.queries.expressions import qb_expression_to_esfilter, simplify_esfilter, qb_expression_to_ruby
from pyLibrary.maths import Math
from pyLibrary.debugs.logs import Log
from pyLibrary.queries import domains, es14, es09, qb
from pyLibrary.queries.containers.cube import Cube
from pyLibrary.queries.domains import is_keyword
from pyLibrary.queries.es14.util import qb_sort_to_es_sort
from pyLibrary.queries.expressions import qb_expression_to_esfilter, simplify_esfilter, qb_expression_to_ruby
from pyLibrary.queries.query import DEFAULT_LIMIT
from pyLibrary.times.timer import Timer
from pyLibrary.queries import es14, es09
format_dispatch = {}
def is_fieldop(es, query):
if not any(map(es.cluster.version.startswith, ["1.4.", "1.5.", "1.6."])):
return False
# THESE SMOOTH EDGES REQUIRE ALL DATA (SETOP)
select = listwrap(query.select)
if not query.edges:
isDeep = len(split_field(query.frum.name)) > 1 # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
isSimple = AND(s.value != None and (s.value in ["*", "."] or is_keyword(s.value)) for s in select)
noAgg = AND(s.aggregate == "none" for s in select)
if not isDeep and isSimple and noAgg:
return True
else:
isSmooth = AND((e.domain.type in domains.ALGEBRAIC and e.domain.interval == "none") for e in query.edges)
if isSmooth:
return True
return False
def es_fieldop(es, query):
es_query, es_filter = es14.util.es_query_template(query.frum.name)
es_query[es_filter]=simplify_esfilter(qb_expression_to_esfilter(query.where))
es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
es_query.sort = qb_sort_to_es_sort(query.sort)
es_query.fields = DictList()
source = "fields"
select = listwrap(query.select)
for s in select.value:
if s == "*":
es_query.fields=None
source = "_source"
elif s == ".":
es_query.fields=None
source = "_source"
elif isinstance(s, basestring) and is_keyword(s):
es_query.fields.append(s)
elif isinstance(s, list) and es_query.fields is not None:
es_query.fields.extend(s)
elif isinstance(s, Mapping) and es_query.fields is not None:
es_query.fields.extend(s.values())
elif es_query.fields is not None:
es_query.fields.append(s)
es_query.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]
return extract_rows(es, es_query, source, select, query)
def extract_rows(es, es_query, source, select, query):
with Timer("call to ES") as call_timer:
data = es09.util.post(es, es_query, query.limit)
T = data.hits.hits
for i, s in enumerate(select.copy()):
# IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
if s.value == "*":
try:
column_names = set(c.name for c in query.frum.get_columns() if (c.type not in ["object"] or c.useSource) and not c.depth)
except Exception, e:
Log.warning("can not get columns", e)
column_names = UNION(*[[k for k, v in row.items()] for row in T.select(source)])
column_names -= set(select.name)
select = select[:i:] + [{"name": n, "value": n} for n in column_names] + select[i + 1::]
break
try:
formatter, groupby_formatter, mime_type = format_dispatch[query.format]
output = formatter(T, select, source)
output.meta.es_response_time = call_timer.duration
output.meta.content_type = mime_type
output.meta.es_query = es_query
return output
except Exception, e:
Log.error("problem formatting", e)
def is_setop(es, query):
if not any(map(es.cluster.version.startswith, ["1.4.", "1.5.", "1.6."])):
if not any(map(es.cluster.version.startswith, ["1.4.", "1.5.", "1.6.", "1.7."])):
return False
select = listwrap(query.select)
@ -133,70 +55,140 @@ def is_setop(es, query):
def es_setop(es, query):
es_query, es_filter = es14.util.es_query_template(query.frum.name)
es_query[es_filter]=simplify_esfilter(qb_expression_to_esfilter(query.where))
es_query[es_filter] = simplify_esfilter(qb_expression_to_esfilter(query.where))
es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
es_query.fields = DictList()
es_query.sort = qb_sort_to_es_sort(query.sort)
es_query.fields = DictList()
return extract_rows(es, es_query, query)
def extract_rows(es, es_query, query):
is_list = isinstance(query.select, list)
new_select = DictList()
column_names = set(c.name for c in query.frum.get_columns() if (c.type not in ["object"]) and not c.nested_path)
source = "fields"
select = listwrap(query.select)
for s in select:
i = 0
for s in listwrap(query.select):
# IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
if s.value == "*":
es_query.fields = None
es_query.script_fields = None
source = "_source"
net_columns = column_names - set(listwrap(query.select).name)
for n in net_columns:
new_select.append({"name": n, "value": n, "put": {"name": n, "index": i, "child": "."}})
i += 1
elif s.value == ".":
es_query.fields = None
es_query.script_fields = None
source = "_source"
new_select.append({"name": s.name if is_list else ".", "value": s.value, "put": {"name": s.name, "index": i, "child": "."}})
i += 1
elif isinstance(s.value, basestring) and s.value.endswith(".*") and is_keyword(s.value[:-2]):
parent = s.value[:-1]
prefix = len(parent)
for c in column_names:
if c.startswith(parent):
if es_query.fields is not None:
es_query.fields.append(c)
new_select.append({"name": s.name+"."+c[prefix:], "value": c, "put": {"name": s.name+"."+c[prefix:], "index": i, "child": "."}})
i += 1
elif isinstance(s.value, basestring) and is_keyword(s.value):
es_query.fields.append(s.value)
elif isinstance(s.value, list) and es_query.fields is not None:
es_query.fields.extend(s.value)
parent = s.value + "."
prefix = len(parent)
net_columns = [c for c in column_names if c.startswith(parent)]
if not net_columns:
if es_query.fields is not None:
es_query.fields.append(s.value)
new_select.append({"name": s.name if is_list else ".", "value": s.value, "put": {"name": s.name, "index": i, "child": "."}})
else:
for n in net_columns:
if es_query.fields is not None:
es_query.fields.append(n)
new_select.append({"name": s.name if is_list else ".", "value": n, "put": {"name": s.name, "index": i, "child": n[prefix:]}})
i += 1
elif isinstance(s.value, list):
Log.error("need an example")
if es_query.fields is not None:
es_query.fields.extend([v for v in s.value])
else:
es_query.script_fields[literal_field(s.name)] = {"script": qb_expression_to_ruby(s.value)}
new_select.append({
"name": s.name if is_list else ".",
"pull": "fields." + literal_field(s.name),
"put": {"name": s.name, "index": i, "child": "."}
})
i += 1
return extract_rows(es, es_query, source, select, query)
for n in new_select:
if n.pull:
continue
if source == "_source":
n.pull = join_field(["_source"] + split_field(n.value))
else:
n.pull = "fields." + literal_field(n.value)
with Timer("call to ES") as call_timer:
data = es09.util.post(es, es_query, query.limit)
T = data.hits.hits
try:
formatter, groupby_formatter, mime_type = format_dispatch[query.format]
output = formatter(T, new_select, query)
output.meta.es_response_time = call_timer.duration
output.meta.content_type = mime_type
output.meta.es_query = es_query
return output
except Exception, e:
Log.error("problem formatting", e)
def format_list(T, select, source):
def format_list(T, select, query=None):
data = []
for row in T:
r = Dict(_id=row._id)
r = Dict()
for s in select:
if s.value == ".":
r[s.name] = row[source]
else:
if source=="_source":
r[s.name] = unwraplist(row[source][s.value])
elif isinstance(s.value, basestring): # fields
r[s.name] = unwraplist(row[source][literal_field(s.value)])
else:
r[s.name] = unwraplist(row[source][literal_field(s.name)])
data.append(r)
r[s.name][s.put.child] = unwraplist(row[s.pull])
data.append(r if r else None)
return Dict(
meta={"format": "list"},
data=data
)
def format_table(T, select, source):
header = [s.name for s in select]
map = {s.name: i for i, s in enumerate(select)} # MAP FROM name TO COLUMN INDEX
def format_table(T, select, query=None):
data = []
num_columns = (Math.MAX(select.put.index)+1)
for row in T:
r = [None] * len(header)
r = [None] * num_columns
for s in select:
if s.value == ".":
r[map[s.name]] = row[source]
value = unwraplist(row[s.pull])
if value == None:
continue
index, child = s.put.index, s.put.child
if child == ".":
r[index] = value
else:
if source == "_source":
r[map[s.name]] = unwraplist(row[source][s.value])
elif isinstance(s.value, basestring): # fields
r[map[s.name]] = unwraplist(row[source][literal_field(s.value)])
else:
r[map[s.name]] = unwraplist(row[source][literal_field(s.name)])
if r[index] is None:
r[index] = Dict()
r[index][child] = value
data.append(r)
header = [None]*num_columns
for s in select:
if header[s.put.index]:
continue
header[s.put.index] = s.put.name
return Dict(
meta={"format": "table"},
header=header,
@ -204,26 +196,22 @@ def format_table(T, select, source):
)
def format_cube(T, select, source):
matricies = {}
for s in select:
try:
if s.value == ".":
matricies[s.name] = Matrix.wrap(T.select(source))
elif isinstance(s.value, list):
matricies[s.name] = Matrix.wrap([tuple(unwraplist(t[source][ss]) for ss in s.value) for t in T])
else:
if source == "_source":
matricies[s.name] = Matrix.wrap([unwraplist(t[source][s.value]) for t in T])
def format_cube(T, select, query=None):
table = format_table(T, select, query)
elif isinstance(s.value, basestring): # fields
matricies[s.name] = Matrix.wrap([unwraplist(t[source].get(s.value)) for t in T])
else:
matricies[s.name] = Matrix.wrap([unwraplist(t[source].get(s.name)) for t in T])
except Exception, e:
Log.error("", e)
cube = Cube(select, edges=[{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": len(T), "interval": 1}}], data=matricies)
return cube
if len(table.data) == 0:
return Cube(
select,
edges=[{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": 0, "interval": 1}}],
data={h: Matrix(list=[]) for i, h in enumerate(table.header)}
)
cols = zip(*unwrap(table.data))
return Cube(
select,
edges=[{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": len(table.data), "interval": 1}}],
data={h: Matrix(list=cols[i]) for i, h in enumerate(table.header)}
)
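# Sketch (not part of this changeset): the cube is built by transposing the table rows, one Matrix
# per header.  Assuming table.header == ["a", "b"] and table.data == [[1, "x"], [2, "y"]]:
example_cols = zip(*[[1, "x"], [2, "y"]])  # -> [(1, 2), ("x", "y")]
# so the cube data becomes {"a": Matrix(list=(1, 2)), "b": Matrix(list=("x", "y"))}.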
set_default(format_dispatch, {

@ -11,10 +11,15 @@ from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from pyLibrary.dot import wrap, join_field, split_field
from pyLibrary.dot import wrap, split_field, join_field
def es_query_template(path):
"""
RETURN TEMPLATE AND PATH-TO-FILTER AS A 2-TUPLE
:param path:
:return:
"""
sub_path = split_field(path)[1:]
if sub_path:
@ -34,24 +39,30 @@ def es_query_template(path):
else:
output = wrap({
"query": {
"filter": {},
"filtered": {
"query": {"match_all": {}},
"filter": {}
}
},
"from": 0,
"size": 0,
"sort": []
})
return output, "query.filter"
return output, "query.filtered.filter"
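# Usage sketch (not part of this changeset), assuming a hypothetical single-segment index name:
#   es_query, es_filter = es_query_template("unittest")
#   es_query[es_filter] = {"term": {"ok": True}}   # es_filter == "query.filtered.filter"
# which is how es_setop() installs the simplified where-clause into the template.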
def qb_sort_to_es_sort(sort):
if not sort:
return []
output = []
for s in sort:
if s.sort == 1:
output.append(s.field)
output.append(s.value)
elif s.sort == -1:
output.append({s.field: "desc"})
output.append({s.value: "desc"})
else:
pass
return output
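# Sketch (not part of this changeset): given already-normalized sort clauses, e.g.
#   qb_sort_to_es_sort([{"value": "build_date", "sort": 1}, {"value": "run_time", "sort": -1}])
# the ES sort list produced is
example_es_sort = ["build_date", {"run_time": "desc"}]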
@ -71,6 +82,8 @@ aggregates1_4 = {
"mean": "avg",
"average": "avg",
"avg": "avg",
"median": "median",
"percentile": "percentile",
"N": "count",
"X0": "count",
"X1": "sum",
@ -81,3 +94,5 @@ aggregates1_4 = {
"variance": "variance"
}
NON_STATISTICAL_AGGS = {"none", "one", "count"}

@ -15,7 +15,7 @@ import itertools
from pyLibrary import convert
from pyLibrary.collections import OR
from pyLibrary.dot import coalesce, wrap, set_default, literal_field
from pyLibrary.dot import coalesce, wrap, set_default, literal_field, listwrap
from pyLibrary.debugs.logs import Log
from pyLibrary.maths import Math
from pyLibrary.queries.domains import is_keyword
@ -25,6 +25,16 @@ from pyLibrary.times.dates import Date
TRUE_FILTER = True
FALSE_FILTER = False
_Query = None
def _late_import():
global _Query
from pyLibrary.queries.query import Query as _Query
_=_Query
def compile_expression(source):
@ -53,7 +63,7 @@ def qb_expression(expr):
def qb_expression_to_function(expr):
if expr!=None and not isinstance(expr, (Mapping, list)) and hasattr(expr, "__call__"):
if expr != None and not isinstance(expr, (Mapping, list)) and hasattr(expr, "__call__"):
return expr
return compile_expression(qb_expression_to_python(expr))
@ -89,7 +99,10 @@ def qb_expression_to_ruby(expr):
elif expr is False:
return "false"
op, term = expr.items()[0]
try:
op, term = expr.items()[0]
except Exception, e:
Log.error("expecting expression (`{op: term}` format)")
mop = ruby_multi_operators.get(op)
if mop:
@ -115,20 +128,15 @@ def qb_expression_to_ruby(expr):
elif isinstance(term, Mapping):
if op == "eq":
# eq CAN ACCEPT A WHOLE OBJECT OF key:value PAIRS TO COMPARE
output = " and ".join("(" + qb_expression_to_ruby(a) + ")" + bop + "(" + qb_expression_to_ruby(b) + ")" for a, b in term.items())
output = " and ".join("(" + qb_expression_to_ruby(var) + bop + convert.value2quote(val) + ")" for var, val in term.items())
return output
else:
a, b = term.items()[0]
output = "(" + qb_expression_to_ruby(a) + ")" + bop + "(" + qb_expression_to_ruby(b) + ")"
var, val = term.items()[0]
output = "(" + qb_expression_to_ruby(var) + bop + convert.value2quote(val) + ")"
return output
else:
Log.error("Expecting binary term")
uop = ruby_unary_operators.get(op)
if uop:
output = expand_template(uop, {"term": qb_expression_to_ruby(term)})
return output
cop = complex_operators.get(op)
if cop:
output = cop(term).to_ruby()
@ -144,7 +152,10 @@ def qb_expression_to_python(expr):
return unicode(expr)
elif isinstance(expr, Date):
return unicode(expr.unix)
elif isinstance(expr, unicode):
elif isinstance(expr, basestring):
if isinstance(expr, str):
expr = convert.utf82unicode(expr)
if expr == ".":
return "row"
elif is_keyword(expr):
@ -165,6 +176,8 @@ def qb_expression_to_python(expr):
if isinstance(term, list):
if not term:
return mop[1] # RETURN DEFAULT
elif len(term)==1:
return qb_expression_to_python(term[0])
else:
output = mop[0].join(["(" + qb_expression_to_python(t) + ")" for t in term])
return output
@ -183,26 +196,32 @@ def qb_expression_to_python(expr):
elif isinstance(term, Mapping):
if op == "eq":
# eq CAN ACCEPT A WHOLE OBJECT OF key:value PAIRS TO COMPARE
output = " and ".join("(" + qb_expression_to_python(a) + ")" + bop + "(" + qb_expression_to_python(b) + ")" for a, b in term.items())
output = " and ".join("(" + qb_expression_to_python(a) + ")" + bop + convert.value2json(b) for a, b in term.items())
return output
else:
a, b = term.items()[0]
output = "(" + qb_expression_to_python(a) + ")" + bop + "(" + qb_expression_to_python(b) + ")"
output = "(" + qb_expression_to_python(a) + ")" + bop + convert.value2json(b)
return output
else:
Log.error("Expecting binary term")
uop = python_unary_operators.get(op)
if uop:
output = uop + "(" + qb_expression_to_python(term) + ")"
cop = complex_operators.get(op)
if cop:
output = cop(op, term).to_python()
return output
Log.error("`{{op}}` is not a recognized operation", op= op)
def get_all_vars(expr):
if not _Query:
_late_import()
if expr == None:
return set()
elif isinstance(expr, _Query):
return query_get_all_vars(expr)
elif isinstance(expr, unicode):
if expr == "." or is_keyword(expr):
return set([expr])
@ -249,10 +268,6 @@ def get_all_vars(expr):
else:
Log.error("Expecting binary term")
uop = ruby_unary_operators.get(op)
if uop:
return get_all_vars(term)
cop = complex_operators.get(op)
if cop:
return cop(op, term).vars()
@ -260,10 +275,134 @@ def get_all_vars(expr):
Log.error("`{{op}}` is not a recognized operation", op= op)
def expression_map(expr, map):
"""
USE map TO MAP VARIABLES NAMES TO SOME OTHER
"""
if expr == None:
return expr
elif Math.is_number(expr):
return expr
elif isinstance(expr, Date):
return expr
elif isinstance(expr, unicode):
if expr == ".":
return expr
elif is_keyword(expr):
return map.get(expr, expr)
else:
Log.error("Expecting a json path")
elif isinstance(expr, CODE):
return expr.code
elif expr is True:
return expr
elif expr is False:
return expr
op, term = expr.items()[0]
mop = python_multi_operators.get(op)
if mop:
output = {op: [expression_map(t, map) for t in term]}
return output
bop = python_binary_operators.get(op)
if bop:
if isinstance(term, list):
output = {op: [expression_map(t, map) for t in term]}
return output
elif isinstance(term, Mapping):
output = {op: {expression_map(k, map): v for k, v, in term.items()}}
return output
else:
Log.error("Expecting binary term")
Log.error("`{{op}}` is not a recognized operation", op=op)
def query_get_all_vars(query, exclude_where=False):
"""
:param query:
:param exclude_where: Sometimes we do not want to look at the where clause
:return: all variables in use by query
"""
output = set()
for s in listwrap(query.select):
output |= select_get_all_vars(s)
for s in listwrap(query.edges):
output |= edges_get_all_vars(s)
for s in listwrap(query.groupby):
output |= edges_get_all_vars(s)
if not exclude_where:
output |= get_all_vars(query.where)
return output
def select_get_all_vars(s):
if isinstance(s.value, list):
return set(s.value)
elif isinstance(s.value, basestring):
return set([s.value])
elif s.value == None or s.value == ".":
return set()
else:
if s.value == "*":
return set(["*"])
return get_all_vars(s.value)
def edges_get_all_vars(e):
output = set()
if isinstance(e.value, basestring):
output.add(e.value)
if e.domain.key:
output.add(e.domain.key)
if e.domain.where:
output |= get_all_vars(e.domain.where)
if e.domain.partitions:
for p in e.domain.partitions:
if p.where:
output |= get_all_vars(p.where)
return output
def where_get_all_vars(w):
if w in [True, False, None]:
return []
output = set()
key = list(w.keys())[0]
val = w[key]
if key in ["and", "or"]:
for ww in val:
output |= get_all_vars(ww)
return output
if key == "not":
return get_all_vars(val)
if key in ["exists", "missing"]:
if isinstance(val, unicode):
return {val}
else:
return {val.field}
if key in ["gte", "gt", "eq", "ne", "term", "terms", "lt", "lte", "range", "prefix"]:
if not isinstance(val, Mapping):
Log.error("Expecting `{{key}}` to have a dict value, not a {{type}}",
key= key,
type= val.__class__.__name__)
return val.keys()
if key == "match_all":
return set()
Log.error("do not know how to handle where {{where|json}}", {"where", w})
python_unary_operators = {
"not": "not {{term}}",
}
python_binary_operators = {
"sub": " - ",
@ -282,6 +421,23 @@ python_binary_operators = {
"term": " == "
}
ruby_binary_operators = {
"sub": " - ",
"subtract": " - ",
"minus": " - ",
"div": " / ",
"divide": " / ",
"exp": " ** ",
"mod": " % ",
"gt": " > ",
"gte": " >= ",
"eq": " == ",
"lte": " <= ",
"lt": " < ",
"ne": " != ",
"term": " == "
}
python_multi_operators = {
"add": (" + ", "0"), # (operator, zero-array default value) PAIR
"sum": (" + ", "0"),
@ -292,27 +448,6 @@ python_multi_operators = {
"or": (" or ", "false")
}
ruby_unary_operators = {
"not": "! {{term}}",
}
ruby_binary_operators = {
"sub": " - ",
"subtract": " - ",
"minus": " - ",
"div": " / ",
"divide": " / ",
"exp": " ** ",
"mod": " % ",
"gt": " > ",
"gte": " >= ",
"eq": " == ",
"lte": " <= ",
"lt": " < ",
"ne": " != ",
"term": " == "
}
ruby_multi_operators = {
"add": (" + ", "0"), # (operator, zero-array default value) PAIR
"sum": (" + ", "0"),
@ -334,10 +469,6 @@ default_multi_operators = {
}
class BinaryOp(object):
def __init__(self, op, term):
self.op = op
@ -347,22 +478,23 @@ class BinaryOp(object):
self.a, self.b = map(qb_expression, term.items()[0])
def to_ruby(self):
symbol = ruby_multi_operators[self.op][0]
symbol = ruby_binary_operators[self.op]
return "(" + self.a.to_ruby() + ")" + symbol + "(" + self.b.to_ruby() + ")"
def to_python(self):
symbol = python_multi_operators[self.op][0]
symbol = python_binary_operators[self.op]
return "(" + self.a.to_python() + ")" + symbol + "(" + self.b.to_python() + ")"
def to_esfilter(self):
if self.op in ["gt", "gte", "lte", "lt"]:
return {"range":{self.op: {self.a: self.b}}}
return {"range": {self.op: {self.a: self.b}}}
else:
Log.error("Operator {{op}} is not supported by ES", op=self.op)
def vars(self):
return self.a.vars() | self.b.vars()
class MultiOp(object):
def __init__(self, op, terms):
self.op = op
@ -391,6 +523,35 @@ class MultiOp(object):
return output
_python_unary_operators = {
"not": "not {{term}}",
"length": 'len({{term}})',
"number": 'float({{term}})',
}
_ruby_unary_operators = {
"not": "! {{term}}",
"length": '({{term}}).length()',
"number": '({{term}}).to_f'
}
class UnaryOp(object):
def __init__(self, op, term):
self.op = op
self.term = qb_expression(term)
def to_ruby(self):
pattern = _ruby_unary_operators[self.op]
return expand_template(pattern, {"term": self.term.to_ruby()})
def to_python(self):
pattern = _python_unary_operators[self.op]
return expand_template(pattern, {"term": self.term.to_python()})
def vars(self):
return self.term.vars()
class RegExpOp(object):
def __init__(self, op, term):
self.var, self.pattern = term.items()[0]
@ -420,6 +581,9 @@ class TermsOp(object):
def vars(self):
return {self.var}
def map(self, map):
return {"terms": {map.get(self.var, self.var): self.vals}}
class ExistsOp(object):
def __init__(self, op, term):
@ -440,6 +604,9 @@ class ExistsOp(object):
def vars(self):
return set([self.field])
def map(self, map):
return {"exists": map.get(self.field, self.field)}
class PrefixOp(object):
def __init__(self, op, term):
@ -457,6 +624,9 @@ class PrefixOp(object):
def vars(self):
return set([self.field])
def map(self, map):
return {"prefix": {map.get(self.field, self.field): self.prefix}}
class MissingOp(object):
def __init__(self, op, term):
@ -477,13 +647,16 @@ class MissingOp(object):
def vars(self):
return set([self.field])
def map(self, map):
return {"missing": map.get(self.field, self.field)}
class NotOp(object):
def __init__(self, op, term):
self.term = qb_expression(term)
def to_ruby(self):
return "not " + self.term.to_ruby()
return "! " + self.term.to_ruby()
def to_python(self):
return "not" + self.term.to_python()
@ -494,16 +667,19 @@ class NotOp(object):
def vars(self):
return self.term.vars()
def map(self, map):
return {"not": self.term.map(map)}
class RangeOp(object):
def __init__(self, op, term):
self.field, self.cmp = term.items()[0]
def to_ruby(self):
return " and ".join(qb_expression_to_ruby([{o: {self.field: v}} for o, v in self.cmp.items()]))
return " and ".join(qb_expression_to_ruby({"and": [{o: {self.field: v}} for o, v in self.cmp.items()]}))
def to_python(self):
return " and ".join(qb_expression_to_python([{o: {self.field: v}} for o, v in self.cmp.items()]))
return " and ".join(qb_expression_to_python({"and": [{o: {self.field: v}} for o, v in self.cmp.items()]}))
def to_esfilter(self):
return {"range": {self.field, self.cmp}}
@ -511,16 +687,19 @@ class RangeOp(object):
def vars(self):
return set([self.field])
def map(self, map):
return {"range": {map.get(self.field, self.field): self.cmp}}
class DocOp(object):
"""
A literal JSON document
"""
def __init__(self, term):
def __init__(self, op, term):
self.json = convert.value2json(term)
def to_ruby(self):
def _convert(v, depth):
def _convert(v):
if v is None:
return "nil"
if v is True:
@ -532,19 +711,11 @@ class DocOp(object):
if isinstance(v, (int, long, float)):
return unicode(v)
if isinstance(v, dict):
var_name = "output" + unicode(depth)
return \
"lambda {\n" + var_name + "={};\n" + \
"".join(
"" + var_name + "[" + convert.string2quote(k) + "]=" + _convert(vv, depth + 1) + ";\n" for k, vv in v.items()
) + \
" return " + var_name + ";\n}.call\n"
return "{" + ", ".join(convert.string2quote(k) + "=>" + _convert(vv) for k, vv in v.items()) + "}"
if isinstance(v, list):
return "[" + ", ".join(_convert(vv, depth+1) for vv in v) + "]"
return "[" + ", ".join(_convert(vv) for vv in v) + "]"
# { output={}; output["failure_classification"]="intermittent"; yield output; }
return _convert(convert.json_decoder(self.json), 0)
return _convert(convert.json_decoder(self.json))
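# Sketch (not part of this changeset): the flattened _convert() now emits Ruby literals directly,
# e.g. a hypothetical document {"a": [1, 2]} becomes the string
example_ruby_literal = '{"a"=>[1, 2]}'
# instead of the old lambda-wrapped builder shown in the removed lines above.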
def to_python(self):
return self.json
@ -561,6 +732,9 @@ class DocOp(object):
complex_operators = {
"not": NotOp,
"length": UnaryOp,
"number": UnaryOp,
"terms": TermsOp,
"exists": ExistsOp,
"missing": MissingOp,

@ -18,7 +18,6 @@ from pyLibrary.dot import listwrap
from pyLibrary.queries import windows
from pyLibrary.queries.containers.cube import Cube
from pyLibrary.queries.domains import SimpleSetDomain, DefaultDomain
# from pyLibrary.queries.py.util import util_filter
from pyLibrary.queries.expressions import qb_expression_to_function

pyLibrary/queries/meta.py Normal file
@ -0,0 +1,481 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http:# mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from copy import copy
from pyLibrary import convert
from pyLibrary.env import elasticsearch
from pyLibrary.env.elasticsearch import ES_NUMERIC_TYPES
from pyLibrary.meta import use_settings
from pyLibrary.queries import qb
from pyLibrary.queries.containers import Container
from pyLibrary.queries.domains import NumericDomain, SimpleSetDomain, UniqueDomain
from pyLibrary.queries.query import Query
from pyLibrary.debugs.logs import Log
from pyLibrary.dot.dicts import Dict
from pyLibrary.dot import coalesce, set_default, Null, literal_field
from pyLibrary.dot import wrap
from pyLibrary.strings import expand_template
from pyLibrary.thread.threads import Queue, Thread, Lock, Till
from pyLibrary.times.dates import Date
from pyLibrary.times.durations import HOUR, MINUTE
DEBUG = True
TOO_OLD = 2*HOUR
singlton = None
class FromESMetadata(Container):
"""
QUERY THE METADATA
"""
def __new__(cls, *args, **kwargs):
global singlton
if singlton:
return singlton
else:
singlton = object.__new__(cls)
return singlton
@use_settings
def __init__(self, host, index, alias=None, name=None, port=9200, settings=None):
if hasattr(self, "settings"):
return
from pyLibrary.queries.containers.lists import ListContainer
Container.__init__(self, None, schema=self)
self.settings = settings
self.default_name = coalesce(name, alias, index)
self.default_es = elasticsearch.Cluster(settings=settings)
self.locker = Lock("")
self.todo = Queue("refresh metadata")
table_columns = metadata_tables()
column_columns = metadata_columns()
self.tables = ListContainer([], wrap({c.name: c for c in table_columns}))
self.columns = ListContainer([], wrap({c.name: c for c in column_columns}))
self.columns.insert(column_columns)
self.columns.insert(table_columns)
self.worker = Thread.run("refresh metadata", self.monitor)
return
@property
def query_path(self):
return None
@property
def url(self):
return self.default_es.path + "/" + self.default_name.replace(".", "/")
def get_table(self, table_name):
with self.locker:
return self.tables.query({"where": {"eq": {"name": table_name}}})
def upsert_column(self, c):
existing_columns = filter(lambda r: r.table == c.table and r.abs_name == c.abs_name, self.columns.data)
if not existing_columns:
self.columns.add(c)
cols = filter(lambda r: r.table == "meta.columns", self.columns.data)
for c in cols:
c.partitions = c.cardinality = c.last_updated = None
self.todo.add(c)
self.todo.extend(cols)
else:
set_default(existing_columns[0], c)
self.todo.add(existing_columns[0])
def _get_columns(self, table=None):
# TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
alias_done = set()
metadata = self.default_es.get_metadata(index=table)
for index, meta in qb.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
for _, properties in meta.mappings.items():
columns = elasticsearch.parse_properties(index, None, properties.properties)
with self.locker:
for c in columns:
# ABSOLUTE
c.table = index
# c.domain = DefaultDomain()
self.upsert_column(c)
for alias in meta.aliases:
# ONLY THE LATEST ALIAS IS CHOSEN TO GET COLUMNS
if alias in alias_done:
continue
alias_done.add(alias)
c = copy(c)
c.table = alias
self.upsert_column(c)
def query(self, _query):
return self.columns.query(Query(set_default(
{
"from": self.columns,
"sort": ["table", "name"]
},
_query.as_dict()
)))
def get_columns(self, table):
"""
RETURN METADATA COLUMNS
"""
with self.locker:
columns = qb.sort(filter(lambda r: r.table == table, self.columns.data), "name")
if columns:
return columns
self._get_columns(table=table)
with self.locker:
columns = qb.sort(filter(lambda r: r.table == table, self.columns.data), "name")
if columns:
return columns
# self._get_columns(table=table)
Log.error("no columns for {{table}}", table=table)
def _update_cardinality(self, c):
"""
QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
"""
if c.type in ["object", "nested"]:
Log.error("not supported")
if c.table == "meta.columns":
with self.locker:
partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.columns, c.abs_name) if g[c.abs_name] != None])
self.columns.update({
"set": {
"partitions": partitions,
"cardinality": len(partitions),
"last_updated": Date.now()
},
"where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
})
return
if c.table == "meta.tables":
with self.locker:
partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.tables, c.abs_name) if g[c.abs_name] != None])
self.columns.update({
"set": {
"partitions": partitions,
"cardinality": len(partitions),
"last_updated": Date.now()
},
"where": {"eq": {"table": c.table, "name": c.name}}
})
return
result = self.default_es.post("/"+c.table+"/_search", data={
"aggs": {c.name: _counting_query(c)},
"size": 0
})
r = result.aggregations.values()[0]
cardinaility = coalesce(r.value, r._nested.value)
query = Dict(size=0)
if c.type in ["object", "nested"]:
Log.note("{{field}} has {{num}} parts", field=c.name, num=c.cardinality)
with self.locker:
self.columns.update({
"set": {
"cardinality": cardinaility,
"last_updated": Date.now()
},
"clear": ["partitions"],
"where": {"eq": {"table": c.table, "name": c.name}}
})
return
elif c.cardinality > 1000:
Log.note("{{field}} has {{num}} parts", field=c.name, num=c.cardinality)
with self.locker:
self.columns.update({
"set": {
"cardinality": cardinaility,
"last_updated": Date.now()
},
"clear": ["partitions"],
"where": {"eq": {"table": c.table, "name": c.name}}
})
return
elif c.type in ES_NUMERIC_TYPES and c.cardinality > 30:
Log.note("{{field}} has {{num}} parts", field=c.name, num=c.cardinality)
with self.locker:
self.columns.update({
"set": {
"cardinality": cardinaility,
"last_updated": Date.now()
},
"clear": ["partitions"],
"where": {"eq": {"table": c.table, "name": c.name}}
})
return
elif c.nested_path:
query.aggs[literal_field(c.name)] = {
"nested": {"path": c.nested_path[0]},
"aggs": {"_nested": {"terms": {"field": c.name, "size": 0}}}
}
else:
query.aggs[literal_field(c.name)] = {"terms": {"field": c.name, "size": 0}}
result = self.default_es.post("/"+c.table+"/_search", data=query)
aggs = result.aggregations.values()[0]
if aggs._nested:
parts = qb.sort(aggs._nested.buckets.key)
else:
parts = qb.sort(aggs.buckets.key)
Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
with self.locker:
self.columns.update({
"set": {
"cardinality": cardinaility,
"partitions": parts,
"last_updated": Date.now()
},
"where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
})
def monitor(self, please_stop):
while not please_stop:
if not self.todo:
with self.locker:
old_columns = filter(lambda c: (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.type not in ["object", "nested"], self.columns)
if old_columns:
self.todo.extend(old_columns)
else:
Log.note("no more metatdata to update")
column = self.todo.pop(timeout=10*MINUTE)
if column:
if column.type in ["object", "nested"]:
continue
if column.last_updated >= Date.now()-TOO_OLD:
continue
self._update_cardinality(column)
Log.note("updated {{column.name}}", column=column)
def _counting_query(c):
if c.nested_path:
return {
"nested": {
"path": c.nested_path[0] # FIRST ONE IS LONGEST
},
"aggs": {
"_nested": {"cardinality": {
"field": c.name,
"precision_threshold": 10 if c.type in ES_NUMERIC_TYPES else 100
}}
}
}
else:
return {"cardinality": {
"field": c.name
}}
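# Sketch (not part of this changeset): for a hypothetical nested column the cardinality comes back
# under the inner "_nested" agg, otherwise at the top level, which is why _update_cardinality()
# above reads coalesce(r.value, r._nested.value).
example_counting_response = {"aggregations": {"attachments.attach_id": {"_nested": {"value": 1042}}}}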
def metadata_columns():
return wrap(
[
Column(
table="meta.columns",
name=c,
abs_name=c,
type="string",
nested_path=Null,
)
for c in [
"name",
"type",
"nested_path",
"relative",
"abs_name",
"table"
]
] + [
Column(
table="meta.columns",
name=c,
abs_name=c,
type="object",
nested_path=Null,
)
for c in [
"domain",
"partitions"
]
] + [
Column(
table="meta.columns",
name=c,
abs_name=c,
type="long",
nested_path=Null,
)
for c in [
"count",
"cardinality"
]
] + [
Column(
table="meta.columns",
name="last_updated",
abs_name="last_updated",
type="time",
nested_path=Null,
)
]
)
def metadata_tables():
return wrap(
[
Column(
table="meta.tables",
name=c,
abs_name=c,
type="string",
nested_path=Null
)
for c in [
"name",
"url",
"query_path"
]
]
)
def DataClass(name, columns):
"""
Each column has {"name", "required", "nulls", "default"} properties
"""
columns = wrap([{"name": c, "required": True, "nulls": False} if isinstance(c, basestring) else c for c in columns])
slots = columns.name
required = wrap(filter(lambda c: c.required and not c.nulls and not c.default, columns)).name
nulls = wrap(filter(lambda c: c.nulls, columns)).name
code = expand_template("""
from __future__ import unicode_literals
from collections import Mapping
class {{name}}(Mapping):
__slots__ = {{slots}}
def __init__(self, **kwargs):
if not kwargs:
return
for s in {{slots}}:
setattr(self, s, kwargs.get(s, kwargs.get('default', Null)))
missed = {{required}}-set(kwargs.keys())
if missed:
Log.error("Expecting properties {"+"{missed}}", missed=missed)
illegal = set(kwargs.keys())-set({{slots}})
if illegal:
Log.error("{"+"{names}} are not a valid properties", names=illegal)
def __getitem__(self, item):
return getattr(self, item)
def __setitem__(self, item, value):
setattr(self, item, value)
return self
def __setattr__(self, item, value):
if item not in {{slots}}:
Log.error("{"+"{item|quote}} not valid attribute", item=item)
object.__setattr__(self, item, value)
def __getattr__(self, item):
Log.error("{"+"{item|quote}} not valid attribute", item=item)
def items(self):
return ((k, getattr(self, k)) for k in {{slots}})
def __copy__(self):
_set = object.__setattr__
output = object.__new__(Column)
{{assign}}
return output
def __iter__(self):
return {{slots}}.__iter__()
def __len__(self):
return {{len_slots}}
def __str__(self):
return str({{dict}})
temp = {{name}}
""",
{
"name": name,
"slots": "(" + (", ".join(convert.value2quote(s) for s in slots)) + ")",
"required": "{" + (", ".join(convert.value2quote(s) for s in required)) + "}",
"nulls": "{" + (", ".join(convert.value2quote(s) for s in nulls)) + "}",
"len_slots": len(slots),
"dict": "{" + (", ".join(convert.value2quote(s) + ": self." + s for s in slots)) + "}",
"assign": "; ".join("_set(output, "+convert.value2quote(s)+", self."+s+")" for s in slots)
}
)
return _exec(code)
def _exec(code):
temp = None
exec(code)
return temp
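# Usage sketch (not part of this changeset): DataClass() exec's the template above into a
# slot-limited, Mapping-like class; Table and Column below are built this way.  Roughly:
#   Point = DataClass("Point", ["x", "y", {"name": "label", "nulls": True}])  # hypothetical class
#   p = Point(x=1, y=2)   # ok; "label" may stay null
#   p["x"]                # -> 1, dict-style access via __getitem__
#   Point(x=1)            # -> Log.error, required property "y" is missing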
class Table(DataClass("Table", [
"name",
"url",
"query_path"
])):
@property
def columns(self):
return FromESMetadata.singlton.get_columns(table=self.name)
Column = DataClass(
"Column",
[
"name",
"abs_name",
"table",
"type",
{"name": "useSource", "default": False},
{"name": "nested_path", "nulls": True}, # AN ARRAY OF PATHS (FROM DEEPEST TO SHALLOWEST) INDICATING THE JSON SUB-ARRAYS
{"name": "relative", "nulls": True},
{"name": "count", "nulls": True},
{"name": "cardinality", "nulls": True},
{"name": "partitions", "nulls": True},
{"name": "last_updated", "nulls": True}
]
)

@ -0,0 +1,59 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from collections import Mapping
from pyLibrary.dot import set_default, Dict
from pyLibrary.queries.query import Query
class Namespace(object):
def convert(self, expr):
raise NotImplementedError()
def _convert_query(self, query):
output = Query()
output.select = self._convert_clause(query.select)
output.where = self.convert(query.where)
output["from"] = self._convert_from(query["from"])
output.edges = self._convert_clause(query.edges)
output.having = convert_list(self._convert_having, query.having)
output.window = convert_list(self._convert_window, query.window)
output.sort = self._convert_clause(query.sort)
output.format = query.format
return output
def _convert_from(self, frum):
raise NotImplementedError()
def _convert_clause(self, clause):
raise NotImplementedError()
def _convert_having(self, clause):
raise NotImplementedError()
def _convert_window(self, clause):
raise NotImplementedError()
def convert_list(operator, operand):
if operand==None:
return None
elif isinstance(operand, Mapping):
return operator(operand)
else:
return map(operator, operand)

@ -0,0 +1,283 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from collections import Mapping
from copy import copy
from pyLibrary.debugs.logs import Log
from pyLibrary.dot.dicts import Dict
from pyLibrary.dot import coalesce, Null
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import wrap, listwrap
from pyLibrary.maths import Math
from pyLibrary.queries.containers import Container
from pyLibrary.queries.dimensions import Dimension
from pyLibrary.queries.domains import Domain
from pyLibrary.queries.expressions import TRUE_FILTER
from pyLibrary.queries.namespace import Namespace, convert_list
from pyLibrary.queries.query import Query, get_all_vars
DEFAULT_LIMIT = 10
class Normal(Namespace):
"""
UNREMARKABLE NAMESPACE, SIMPLY FOR CONVERTING QUERY TO NORMAL FORM
"""
def convert(self, expr):
if isinstance(expr, Mapping) and expr["from"]:
return self._convert_query(expr)
return expr
def _convert_query(self, query):
# if not isinstance(query["from"], Container):
# Log.error('Expecting from clause to be a Container')
query = wrap(query)
output = Query()
output["from"] = self._convert_from(query["from"])
output.format = query.format
if query.select:
output.select = convert_list(self._convert_select, query.select)
else:
if query.edges or query.groupby:
output.select = {"name": "count", "value": ".", "aggregate": "count"}
else:
output.select = {"name": "__all__", "value": "*", "aggregate": "none"}
if query.groupby and query.edges:
Log.error("You can not use both the `groupby` and `edges` clauses in the same query!")
elif query.edges:
output.edges = convert_list(self._convert_edge, query.edges)
output.groupby = None
elif query.groupby:
output.edges = None
output.groupby = convert_list(self._convert_group, query.groupby)
else:
output.edges = []
output.groupby = None
output.where = self.convert(query.where)
output.window = convert_list(self._convert_window, query.window)
output.sort = self._convert_sort(query.sort)
output.limit = coalesce(query.limit, DEFAULT_LIMIT)
if not Math.is_integer(output.limit) or output.limit < 0:
Log.error("Expecting limit >= 0")
output.isLean = query.isLean
# DEPTH ANALYSIS - LOOK FOR COLUMN REFERENCES THAT MAY BE DEEPER THAN
# THE from SOURCE IS.
vars = get_all_vars(output, exclude_where=True) # WE WILL EXCLUDE where VARIABLES
for c in query.columns:
if c.name in vars and c.nested_path:
Log.error("This query, with variable {{var_name}} is too deep", var_name=c.name)
output.having = convert_list(self._convert_having, query.having)
return output
def _convert_from(self, frum):
if isinstance(frum, basestring):
return Dict(name=frum)
elif isinstance(frum, (Container, Query)):
return frum
else:
Log.error("Expecting from clause to be a name, or a container")
def _convert_select(self, select):
if isinstance(select, basestring):
return Dict(
name=select.rstrip("."), # TRAILING DOT INDICATES THE VALUE, BUT IS INVALID FOR THE NAME
value=select,
aggregate="none"
)
else:
select = wrap(select)
output = copy(select)
if not select.value or isinstance(select.value, basestring):
if select.value == ".":
output.name = coalesce(select.name, select.aggregate)
else:
output.name = coalesce(select.name, select.value, select.aggregate)
elif not output.name:
Log.error("Must give name to each column in select clause")
if not output.name:
Log.error("expecting select to have a name: {{select}}", select=select)
output.aggregate = coalesce(canonical_aggregates.get(select.aggregate), select.aggregate, "none")
return output
def _convert_edge(self, edge):
if isinstance(edge, basestring):
return Dict(
name=edge,
value=edge,
domain=self._convert_domain()
)
else:
edge = wrap(edge)
if not edge.name and not isinstance(edge.value, basestring):
Log.error("You must name compound edges: {{edge}}", edge= edge)
if isinstance(edge.value, (Mapping, list)) and not edge.domain:
# COMPLEX EDGE IS SHORTHAND
domain =self._convert_domain()
domain.dimension = Dict(fields=edge.value)
return Dict(
name=edge.name,
allowNulls=False if edge.allowNulls is False else True,
domain=domain
)
domain = self._convert_domain(edge.domain)
return Dict(
name=coalesce(edge.name, edge.value),
value=edge.value,
range=edge.range,
allowNulls=False if edge.allowNulls is False else True,
domain=domain
)
def _convert_group(self, column):
if isinstance(column, basestring):
return wrap({
"name": column,
"value": column,
"domain": {"type": "default"}
})
else:
column = wrap(column)
if (column.domain and column.domain.type != "default") or column.allowNulls != None:
Log.error("groupby does not accept complicated domains")
if not column.name and not isinstance(column.value, basestring):
Log.error("You must name compound edges: {{edge}}", edge= column)
return wrap({
"name": coalesce(column.name, column.value),
"value": column.value,
"domain": {"type": "default"}
})
def _convert_domain(self, domain=None):
if not domain:
return Domain(type="default")
elif isinstance(domain, Dimension):
return domain.getDomain()
elif isinstance(domain, Domain):
return domain
if not domain.name:
domain = domain.copy()
domain.name = domain.type
if not isinstance(domain.partitions, list):
domain.partitions = list(domain.partitions)
return Domain(**domain)
def _convert_range(self, range):
if range == None:
return None
return Dict(
min=range.min,
max=range.max
)
def _convert_where(self, where):
if where == None:
return TRUE_FILTER
return where
def _convert_window(self, window):
return Dict(
name=coalesce(window.name, window.value),
value=window.value,
edges=[self._convert_edge(e) for e in listwrap(window.edges)],
sort=self._convert_sort(window.sort),
aggregate=window.aggregate,
range=self._convert_range(window.range),
where=self._convert_where(window.where)
)
def _convert_sort(self, sort):
return normalize_sort(sort)
def normalize_sort(sort=None):
"""
CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE
"""
if not sort:
return DictList.EMPTY
output = DictList()
for s in listwrap(sort):
if isinstance(s, basestring) or Math.is_integer(s):
output.append({"value": s, "sort": 1})
elif not s.field and not s.value and s.sort==None:
#ASSUME {name: sort} FORM
for n, v in s.items():
output.append({"value": n, "sort": sort_direction[v]})
else:
output.append({"value": coalesce(s.field, s.value), "sort": coalesce(sort_direction[s.sort], 1)})
return wrap(output)
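# Sketch (not part of this changeset): all accepted sort spellings normalize to the same form:
#   normalize_sort("run_time")                         -> [{"value": "run_time", "sort": 1}]
#   normalize_sort({"build_date": "desc"})             -> [{"value": "build_date", "sort": -1}]
#   normalize_sort({"field": "run_time", "sort": -1})  -> [{"value": "run_time", "sort": -1}]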
sort_direction = {
"asc": 1,
"desc": -1,
"none": 0,
1: 1,
0: 0,
-1: -1,
None: 1,
Null: 1
}
canonical_aggregates = {
"none": "none",
"one": "one",
"count": "count",
"sum": "sum",
"add": "sum",
"mean": "average",
"average": "average",
"avg": "average",
"min": "minimum",
"minimum": "minimum",
"max": "maximum",
"maximum": "minimum",
"X2": "sum_of_squares",
"std": "std",
"stddev": "std",
"std_deviation": "std",
"var": "variance",
"variance": "variance",
"stats": "stats"
}

@ -0,0 +1,140 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from collections import Mapping
from copy import copy
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import set_default, wrap, coalesce, Dict, listwrap, unwraplist
from pyLibrary.maths import Math
from pyLibrary.queries.dimensions import Dimension
from pyLibrary.queries.domains import is_keyword
from pyLibrary.queries.namespace import Namespace, convert_list
from pyLibrary.queries.query import Query
from pyLibrary.times.dates import Date
class Rename(Namespace):
def __init__(self, dimensions, source):
"""
EXPECTING A LIST OF {"name":name, "value":value} OBJECTS TO PERFORM A MAPPING
"""
dimensions = wrap(dimensions)
if isinstance(dimensions, Mapping) and dimensions.name == None:
# CONVERT TO A REAL DIMENSION DEFINITION
dimensions = {"name": ".", "type": "set", "edges":[{"name": k, "field": v} for k, v in dimensions.items()]}
self.dimensions = Dimension(dimensions, None, source)
def convert(self, expr):
"""
EXPAND INSTANCES OF name TO value
"""
if expr is True or expr == None or expr is False:
return expr
elif Math.is_number(expr):
return expr
elif expr == ".":
return "."
elif is_keyword(expr):
return coalesce(self.dimensions[expr], expr)
elif isinstance(expr, basestring):
Log.error("{{name|quote}} is not a valid variable name", name=expr)
elif isinstance(expr, Date):
return expr
elif isinstance(expr, Query):
return self._convert_query(expr)
elif isinstance(expr, Mapping):
if expr["from"]:
return self._convert_query(expr)
elif len(expr) >= 2:
#ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
return wrap({name: self.convert(value) for name, value in expr.leaves()})
else:
# ASSUME SINGLE-CLAUSE EXPRESSION
k, v = expr.items()[0]
return converter_map.get(k, self._convert_bop)(self, k, v)
elif isinstance(expr, (list, set, tuple)):
return wrap([self.convert(value) for value in expr])
else:
return expr
def _convert_query(self, query):
output = Query(None)
output.select = self._convert_clause(query.select)
output.where = self.convert(query.where)
output.frum = self._convert_from(query.frum)
output.edges = convert_list(self._convert_edge, query.edges)
output.having = convert_list(self._convert_having, query.having)
output.window = convert_list(self._convert_window, query.window)
output.sort = self._convert_clause(query.sort)
output.format = query.format
return output
def _convert_bop(self, op, term):
if isinstance(term, list):
return {op: map(self.convert, term)}
return {op: {self.convert(var): val for var, val in term.items()}}
def _convert_many(self, k, v):
return {k: map(self.convert, v)}
def _convert_from(self, frum):
if isinstance(frum, Mapping):
return Dict(name=self.convert(frum.name))
else:
return self.convert(frum)
def _convert_edge(self, edge):
dim = self.dimensions[edge.value]
if not dim:
return edge
if len(listwrap(dim.fields)) == 1:
#TODO: CHECK IF EDGE DOMAIN AND DIMENSION DOMAIN CONFLICT
new_edge = set_default({"value": unwraplist(dim.fields)}, edge)
return new_edge
edge = copy(edge)
edge.value = None
edge.domain = dim.getDomain()
return edge
def _convert_clause(self, clause):
"""
Qb QUERIES HAVE MANY CLAUSES WITH SIMILAR COLUMN DECLARATIONS
"""
clause = wrap(clause)
if clause == None:
return None
elif isinstance(clause, Mapping):
return set_default({"value": self.convert(clause.value)}, clause)
else:
return [set_default({"value": self.convert(c.value)}, c) for c in clause]
converter_map = {
"and": Rename._convert_many,
"or": Rename._convert_many,
"not": Rename.convert,
"missing": Rename.convert,
"exists": Rename.convert
}
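# A rough usage sketch (the dimension mapping below is hypothetical, and the exact
# value returned for a recognized name depends on Dimension.__getitem__, which is
# assumed here; `source` would be whatever container the fields refer to):
#
#     renamer = Rename({"product": "build.product"}, source=None)
#     renamer.convert("product")    # a known name is expanded via self.dimensions
#     renamer.convert("build.id")   # an unrecognized keyword passes through unchanged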


@ -0,0 +1,111 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from collections import Mapping
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import set_default, wrap, Dict, Null
from pyLibrary.maths import Math
from pyLibrary.queries.domains import is_keyword
from pyLibrary.queries.namespace import convert_list, Namespace
from pyLibrary.queries.query import Query
from pyLibrary.times.dates import Date
class Typed(Namespace):
def __init__(self):
self.converter_map = {
"and": self._convert_many,
"or": self._convert_many,
"not": self.convert,
"missing": self.convert,
"exists": self.convert
}
def convert(self, expr):
"""
ADD THE ".$value" SUFFIX TO ALL VARIABLES
"""
if expr is True or expr == None or expr is False:
return expr
elif Math.is_number(expr):
return expr
elif expr == ".":
return "."
elif is_keyword(expr):
#TODO: LOOKUP SCHEMA AND ADD ALL COLUMNS WITH THIS PREFIX
return expr + ".$value"
elif isinstance(expr, basestring):
Log.error("{{name|quote}} is not a valid variable name", name=expr)
elif isinstance(expr, Date):
return expr
elif isinstance(expr, Query):
return self._convert_query(expr)
elif isinstance(expr, Mapping):
if expr["from"]:
return self._convert_query(expr)
elif len(expr) >= 2:
#ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
return wrap({name: self.convert(value) for name, value in expr.items()})
else:
# ASSUME SINGLE-CLAUSE EXPRESSION
k, v = expr.items()[0]
return self.converter_map.get(k, self._convert_bop)(k, v)
elif isinstance(expr, (list, set, tuple)):
return wrap([self.convert(value) for value in expr])
def _convert_query(self, query):
output = Query(Null)
output.select = self._convert_clause(query.select)
output.where = self.convert(query.where)
output.frum = self._convert_from(query.frum)
output.edges = self._convert_clause(query.edges)
output.groupby = self._convert_clause(query.groupby)
output.window = convert_list(self._convert_window, query.window)
output.having = convert_list(self._convert_having, query.having)
output.sort = self._convert_clause(query.sort)
output.limit = query.limit
output.format = query.format
return output
def _convert_clause(self, clause):
"""
Qb QUERIES HAVE MANY CLAUSES WITH SIMILAR COLUMN DECLARATIONS
"""
if clause == None:
return None
elif isinstance(clause, Mapping):
return set_default({"value": self.convert(clause["value"])}, clause)
else:
return [set_default({"value": self.convert(c.value)}, c) for c in clause]
def _convert_from(self, frum):
return frum
def _convert_having(self, having):
raise NotImplementedError()
def _convert_window(self, window):
raise NotImplementedError()
def _convert_many(self, k, v):
return {k: map(self.convert, v)}
def _convert_bop(self, op, term):
if isinstance(term, list):
return {op: map(self.convert, term)}
return {op: {var: val for var, val in term.items()}}
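# A small illustration of the typed conversion (the field name is hypothetical):
#
#     Typed().convert("status")   # -> "status.$value"
#     Typed().convert(42)         # numbers, True/False/None and "." pass through unchanged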


@ -1,424 +0,0 @@
# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from collections import Mapping
from pyLibrary.collections import AND, reverse
from pyLibrary.debugs.logs import Log
from pyLibrary.dot.dicts import Dict
from pyLibrary.dot import coalesce, split_field, join_field, Null
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import wrap, unwrap, listwrap
from pyLibrary.maths import Math
from pyLibrary.queries.dimensions import Dimension
from pyLibrary.queries.domains import Domain, is_keyword
from pyLibrary.queries.expressions import TRUE_FILTER, simplify_esfilter
DEFAULT_LIMIT = 10
_qb = None
_INDEX_CACHE = None
def _late_import():
global _qb
global _INDEX_CACHE
from pyLibrary.queries import qb as _qb
from pyLibrary.queries.es09.util import INDEX_CACHE as _INDEX_CACHE
_ = _qb
_ = _INDEX_CACHE
def _normalize_selects(selects, schema=None):
if isinstance(selects, list):
output = wrap([_normalize_select(s, schema=schema) for s in selects])
exists = set()
for s in output:
if s.name in exists:
Log.error("{{name}} has already been defined", name= s.name)
exists.add(s.name)
return output
else:
return _normalize_select(selects, schema=schema)
def _normalize_select(select, schema=None):
if isinstance(select, basestring):
if schema:
s = schema[select]
if s:
return s.getSelect()
return Dict(
name=select.rstrip("."), # TRAILING DOT INDICATES THE VALUE, BUT IS INVALID FOR THE NAME
value=select,
aggregate="none"
)
else:
select = wrap(select)
output = select.copy()
if not select.value or isinstance(select.value, basestring):
output.name = coalesce(select.name, select.value, select.aggregate)
elif not output.name:
Log.error("Must give name to each column in select clause")
if not output.name:
Log.error("expecting select to have a name: {{select}}", select=select)
output.aggregate = coalesce(canonical_aggregates.get(select.aggregate), select.aggregate, "none")
return output
def _normalize_edges(edges, schema=None):
return [_normalize_edge(e, schema=schema) for e in listwrap(edges)]
def _normalize_edge(edge, schema=None):
if isinstance(edge, basestring):
if schema:
e = schema[edge]
if e:
if isinstance(e.fields, list) and len(e.fields) == 1:
return Dict(
name=e.name,
value=e.fields[0],
domain=e.getDomain()
)
else:
return Dict(
name=e.name,
domain=e.getDomain()
)
return Dict(
name=edge,
value=edge,
domain=_normalize_domain(schema=schema)
)
else:
edge = wrap(edge)
if not edge.name and not isinstance(edge.value, basestring):
Log.error("You must name compound edges: {{edge}}", edge= edge)
if isinstance(edge.value, (Mapping, list)) and not edge.domain:
# COMPLEX EDGE IS SHORT HAND
domain = _normalize_domain(schema=schema)
domain.dimension = Dict(fields=edge.value)
return Dict(
name=edge.name,
allowNulls=False if edge.allowNulls is False else True,
domain=domain
)
domain = _normalize_domain(edge.domain, schema=schema)
return Dict(
name=coalesce(edge.name, edge.value),
value=edge.value,
range=edge.range,
allowNulls=False if edge.allowNulls is False else True,
domain=domain
)
def _normalize_groupby(groupby, schema=None):
if groupby == None:
return None
return [_normalize_group(e, schema=schema) for e in listwrap(groupby)]
def _normalize_group(edge, schema=None):
if isinstance(edge, basestring):
return wrap({
"name": edge,
"value": edge,
"domain": {"type": "default"}
})
else:
edge = wrap(edge)
if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
Log.error("groupby does not accept complicated domains")
if not edge.name and not isinstance(edge.value, basestring):
Log.error("You must name compound edges: {{edge}}", edge= edge)
return wrap({
"name": coalesce(edge.name, edge.value),
"value": edge.value,
"domain": {"type": "default"}
})
def _normalize_domain(domain=None, schema=None):
if not domain:
return Domain(type="default")
elif isinstance(domain, Dimension):
return domain.getDomain()
elif schema and isinstance(domain, basestring) and schema[domain]:
return schema[domain].getDomain()
elif isinstance(domain, Domain):
return domain
if not domain.name:
domain = domain.copy()
domain.name = domain.type
if not isinstance(domain.partitions, list):
domain.partitions = list(domain.partitions)
return Domain(**domain)
def _normalize_range(range):
if range == None:
return None
return Dict(
min=range.min,
max=range.max
)
def _normalize_where(where, schema=None):
if where == None:
return TRUE_FILTER
if schema == None:
return where
where = simplify_esfilter(_where_terms(where, where, schema))
return where
def _normalize_window(window, schema=None):
return Dict(
name=coalesce(window.name, window.value),
value=window.value,
edges=[_normalize_edge(e, schema) for e in listwrap(window.edges)],
sort=_normalize_sort(window.sort),
aggregate=window.aggregate,
range=_normalize_range(window.range),
where=_normalize_where(window.where, schema=schema)
)
def _map_term_using_schema(master, path, term, schema_edges):
"""
IF THE WHERE CLAUSE REFERS TO FIELDS IN THE SCHEMA, THEN EXPAND THEM
"""
output = DictList()
for k, v in term.items():
dimension = schema_edges[k]
if isinstance(dimension, Dimension):
domain = dimension.getDomain()
if dimension.fields:
if isinstance(dimension.fields, Mapping):
# EXPECTING A TUPLE
for local_field, es_field in dimension.fields.items():
local_value = v[local_field]
if local_value == None:
output.append({"missing": {"field": es_field}})
else:
output.append({"term": {es_field: local_value}})
continue
if len(dimension.fields) == 1 and is_keyword(dimension.fields[0]):
# SIMPLE SINGLE-VALUED FIELD
if domain.getPartByKey(v) is domain.NULL:
output.append({"missing": {"field": dimension.fields[0]}})
else:
output.append({"term": {dimension.fields[0]: v}})
continue
if AND(is_keyword(f) for f in dimension.fields):
# EXPECTING A TUPLE
if not isinstance(v, tuple):
Log.error("expecing {{name}}={{value}} to be a tuple", name= k, value= v)
for i, f in enumerate(dimension.fields):
vv = v[i]
if vv == None:
output.append({"missing": {"field": f}})
else:
output.append({"term": {f: vv}})
continue
if len(dimension.fields) == 1 and is_keyword(dimension.fields[0]):
if domain.getPartByKey(v) is domain.NULL:
output.append({"missing": {"field": dimension.fields[0]}})
else:
output.append({"term": {dimension.fields[0]: v}})
continue
if domain.partitions:
part = domain.getPartByKey(v)
if part is domain.NULL or not part.esfilter:
Log.error("not expected to get NULL")
output.append(part.esfilter)
continue
else:
Log.error("not expected")
elif isinstance(v, Mapping):
sub = _map_term_using_schema(master, path + [k], v, schema_edges[k])
output.append(sub)
continue
output.append({"term": {k: v}})
return {"and": output}
def _move_nested_term(master, where, schema):
"""
THE WHERE CLAUSE CAN CONTAIN NESTED PROPERTY REFERENCES, THESE MUST BE MOVED
TO A NESTED FILTER
"""
items = where.term.items()
if len(items) != 1:
Log.error("Expecting only one term")
k, v = items[0]
nested_path = _get_nested_path(k, schema)
if nested_path:
return {"nested": {
"path": nested_path,
"query": {"filtered": {
"query": {"match_all": {}},
"filter": {"and": [
{"term": {k: v}}
]}
}}
}}
return where
def _get_nested_path(field, schema):
if not _INDEX_CACHE:
_late_import()
if is_keyword(field):
field = join_field([schema.es.alias] + split_field(field))
for i, f in reverse(enumerate(split_field(field))):
path = join_field(split_field(field)[0:i + 1:])
if path in _INDEX_CACHE:
return join_field(split_field(path)[1::])
return None
def _where_terms(master, where, schema):
"""
USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)
"""
if isinstance(where, Mapping):
if where.term:
# MAP TERM
try:
output = _map_term_using_schema(master, [], where.term, schema.edges)
return output
except Exception, e:
Log.error("programmer problem?", e)
elif where.terms:
# MAP TERM
output = DictList()
for k, v in where.terms.items():
if not isinstance(v, (list, set)):
Log.error("terms filter expects list of values")
edge = schema.edges[k]
if not edge:
output.append({"terms": {k: v}})
else:
if isinstance(edge, basestring):
# DIRECT FIELD REFERENCE
return {"terms": {edge: v}}
try:
domain = edge.getDomain()
except Exception, e:
Log.error("programmer error", e)
fields = domain.dimension.fields
if isinstance(fields, Mapping):
or_agg = []
for vv in v:
and_agg = []
for local_field, es_field in fields.items():
vvv = vv[local_field]
if vvv != None:
and_agg.append({"term": {es_field: vvv}})
or_agg.append({"and": and_agg})
output.append({"or": or_agg})
elif isinstance(fields, list) and len(fields) == 1 and is_keyword(fields[0]):
output.append({"terms": {fields[0]: v}})
elif domain.partitions:
output.append({"or": [domain.getPartByKey(vv).esfilter for vv in v]})
return {"and": output}
elif where["or"]:
return {"or": [unwrap(_where_terms(master, vv, schema)) for vv in where["or"]]}
elif where["and"]:
return {"and": [unwrap(_where_terms(master, vv, schema)) for vv in where["and"]]}
elif where["not"]:
return {"not": unwrap(_where_terms(master, where["not"], schema))}
return where
def _normalize_sort(sort=None):
"""
CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE
"""
if not sort:
return DictList.EMPTY
output = DictList()
for s in listwrap(sort):
if isinstance(s, basestring) or Math.is_integer(s):
output.append({"field": s, "sort": 1})
elif not s.field and not s.value and s.sort==None:
#ASSUME {name: sort} FORM
for n, v in s.items():
output.append({"field": n, "sort": sort_direction[v]})
else:
output.append({"field": coalesce(s.field, s.value), "sort": coalesce(sort_direction[s.sort], 1)})
return wrap(output)
sort_direction = {
"asc": 1,
"desc": -1,
"none": 0,
1: 1,
0: 0,
-1: -1,
None: 1,
Null: 1
}
canonical_aggregates = {
"none": "none",
"one": "one",
"count": "value_count",
"sum": "sum",
"add": "sum",
"mean": "average",
"average": "average",
"avg": "average",
"min": "minimum",
"minimum": "minimum",
"max": "maximum",
"maximum": "minimum",
"X2": "sum_of_squares",
"std": "std",
"stddev": "std",
"std_deviation": "std",
"var": "variance",
"variance": "variance",
"stats": "stats"
}


@ -21,34 +21,82 @@ from pyLibrary.debugs.logs import Log
from pyLibrary.dot import set_default, Null, Dict, split_field, coalesce, join_field
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import listwrap, wrap, unwrap
from pyLibrary.dot.objects import DictObject
from pyLibrary.dot.objects import DictClass, DictObject
from pyLibrary.maths import Math
from pyLibrary.queries import flat_list, query, group_by
from pyLibrary.queries.containers import Container
from pyLibrary.queries.cubes.aggs import cube_aggs
from pyLibrary.queries.expressions import TRUE_FILTER, FALSE_FILTER, compile_expression, qb_expression_to_function
from pyLibrary.queries.expressions import TRUE_FILTER, FALSE_FILTER, compile_expression, qb_expression_to_function, qb_expression_to_python
from pyLibrary.queries.flat_list import FlatList
from pyLibrary.queries.index import Index
from pyLibrary.queries.query import Query, _normalize_selects, sort_direction, _normalize_select
from pyLibrary.queries.containers.cube import Cube
from pyLibrary.queries.normalize import _normalize_sort, _normalize_select, _normalize_selects
from pyLibrary.queries.query import Query
from pyLibrary.queries.unique_index import UniqueIndex
# A COLLECTION OF DATABASE OPERATORS (RELATIONAL ALGEBRA OPERATORS)
# qb QUERY DOCUMENTATION: https://github.com/klahnakoski/qb/tree/master/docs
# START HERE: https://github.com/klahnakoski/qb/blob/master/docs/Qb_Reference.md
# TODO: USE http://docs.sqlalchemy.org/en/latest/core/tutorial.html AS DOCUMENTATION FRAMEWORK
def run(query):
def run(query, frum=None):
"""
THIS FUNCTION IS SIMPLY SWITCHING BASED ON THE query["from"] CONTAINER,
BUT IT IS ALSO PROCESSING A list CONTAINER; SEPARATE TO A ListContainer
"""
frum = Container.new_instance(query["from"])
q = Query(query, frum)
return frum.query(q)
query = Query(query)
frum = coalesce(frum, query["from"])
if isinstance(frum, Container):
return frum.query(query)
elif isinstance(frum, (list, set, GeneratorType)):
frum = wrap(list(frum))
elif isinstance(frum, Cube):
if is_aggs(query):
return cube_aggs(frum, query)
elif isinstance(frum, Query):
frum = run(frum).data
else:
Log.error("Do not know how to handle {{type}}", type=frum.__class__.__name__)
if is_aggs(query):
frum = list_aggs(frum, query)
else: # SETOP
# try:
# if query.filter != None or query.esfilter != None:
# Log.error("use 'where' clause")
# except AttributeError:
# pass
if query.where is not TRUE_FILTER:
frum = filter(frum, query.where)
if query.sort:
frum = sort(frum, query.sort)
if query.select:
frum = select(frum, query.select)
if query.window:
if isinstance(frum, Cube):
frum = list(frum.values())
for param in query.window:
window(frum, param)
# AT THIS POINT frum IS IN LIST FORMAT, NOW PACKAGE RESULT
if query.format == "cube":
frum = convert.list2cube(frum)
elif query.format == "table":
frum = convert.list2table(frum)
frum.meta.format = "table"
else:
frum = wrap({
"meta": {"format": "list"},
"data": frum
})
return frum
groupby = group_by.groupby
@ -221,12 +269,13 @@ def select_one(record, selection):
output = Dict()
for f in selection:
f = _normalize_select(f)
output[f.name]=record[f.value]
output[f.name] = record[f.value]
return output
else:
Log.error("Do not know how to handle")
def select(data, field_name):
"""
return list with values from field_name
@ -395,8 +444,8 @@ def _select_deep_meta(field, depth):
return assign
# def get_columns(data):
# return wrap([{"name": n} for n in UNION(set(d.keys()) for d in data)])
def get_columns(data):
return wrap([{"name": n} for n in UNION(set(d.keys()) for d in data)])
def sort(data, fieldnames=None):
@ -410,19 +459,29 @@ def sort(data, fieldnames=None):
if fieldnames == None:
return wrap(sorted(data))
fieldnames = _normalize_sort(fieldnames)
fieldnames = listwrap(fieldnames)
if len(fieldnames) == 1:
fieldnames = fieldnames[0]
# SPECIAL CASE, ONLY ONE FIELD TO SORT BY
if fieldnames.field == ".":
if fieldnames == ".":
return wrap(sorted(data))
if isinstance(fieldnames, (basestring, int)):
fieldnames = wrap({"value": fieldnames, "sort": 1})
# EXPECTING {"field":f, "sort":i} FORMAT
fieldnames.sort = sort_direction.get(fieldnames.sort, 1)
fieldnames.value = coalesce(fieldnames.value, fieldnames.field)
if fieldnames.value==None:
Log.error("Expecting sort to have 'value' attribute")
if fieldnames.value == ".":
#VALUE COMPARE
def _compare_v(l, r):
return value_compare(l, r, fieldnames.sort)
return DictList([unwrap(d) for d in sorted(data, cmp=_compare_v)])
else:
def _compare_o(left, right):
return value_compare(coalesce(left)[fieldnames.field], coalesce(right)[fieldnames.field], fieldnames.sort)
return value_compare(coalesce(left)[fieldnames.value], coalesce(right)[fieldnames.value], fieldnames.sort)
return DictList([unwrap(d) for d in sorted(data, cmp=_compare_o)])
formal = query._normalize_sort(fieldnames)
@ -432,7 +491,7 @@ def sort(data, fieldnames=None):
right = coalesce(right)
for f in formal:
try:
result = value_compare(left[f.field], right[f.field], f.sort)
result = value_compare(left[f.value], right[f.value], f.sort)
if result != 0:
return result
except Exception, e:
@ -449,7 +508,7 @@ def sort(data, fieldnames=None):
return output
except Exception, e:
Log.error("Problem sorting\n{{data}}", data= data, cause=e)
Log.error("Problem sorting\n{{data}}", data=data, cause=e)
def value_compare(l, r, ordering=1):
@ -491,9 +550,13 @@ def filter(data, where):
if isinstance(data, Cube):
data.filter(where)
temp = None
exec("def temp(row):\n return "+qb_expression_to_python(where))
return data.filter(temp)
try:
return drill_filter(where, data)
except Exception, e:
except Exception, _:
# WOW! THIS IS INEFFICIENT!
return wrap([unwrap(d) for d in drill_filter(where, [DictObject(d) for d in data])])
@ -516,7 +579,10 @@ def drill_filter(esfilter, data):
col = split_field(fieldname)
d = data
for i, c in enumerate(col):
d = d[c]
try:
d = d[c]
except Exception, e:
Log.error("{{name}} does not exist", name=fieldname)
if isinstance(d, list) and len(col) > 1:
if len(primary_column) <= depth+i:
primary_nested.append(True)
@ -581,10 +647,11 @@ def drill_filter(esfilter, data):
return True
else:
return {"not": f}
elif filter.term:
elif filter.term or filter.eq:
eq = coalesce(filter.term, filter.eq)
result = True
output = {}
for col, val in filter["term"].items():
for col, val in eq.items():
first, rest = parse_field(col, data, depth)
d = data[first]
if not rest:
@ -896,4 +963,4 @@ def reverse(vals):
return wrap(output)
from pyLibrary.queries.list.aggs import is_aggs, list_aggs
from pyLibrary.queries.lists.aggs import is_aggs, list_aggs


@ -10,24 +10,28 @@
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from collections import Mapping
from copy import copy
from collections import Mapping
from pyLibrary import convert
from pyLibrary.env import elasticsearch, http
from pyLibrary.meta import use_settings
from pyLibrary.queries import qb, expressions
from pyLibrary.queries.containers import Container, config
from pyLibrary.queries import qb, expressions, containers
from pyLibrary.queries.containers import Container
from pyLibrary.queries.domains import is_keyword
from pyLibrary.queries.es09 import setop as es09_setop
from pyLibrary.queries.es09.util import parse_columns, INDEX_CACHE
from pyLibrary.queries.es14.aggs import es_aggsop, is_aggsop
from pyLibrary.queries.es14.setop import is_fieldop, is_setop, es_setop, es_fieldop
from pyLibrary.queries.es14.deep import is_deepop, es_deepop
from pyLibrary.queries.es14.setop import is_setop, es_setop
from pyLibrary.queries.dimensions import Dimension
from pyLibrary.queries.es14.util import aggregates1_4
from pyLibrary.queries.meta import FromESMetadata
from pyLibrary.queries.namespace.typed import Typed
from pyLibrary.queries.query import Query, _normalize_where
from pyLibrary.debugs.logs import Log, Except
from pyLibrary.dot.dicts import Dict
from pyLibrary.dot import coalesce, split_field, set_default, literal_field, unwraplist
from pyLibrary.dot import coalesce, split_field, literal_field, unwraplist, join_field
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import wrap, listwrap
@ -43,24 +47,27 @@ class FromES(Container):
output.__init__(*args, **kwargs)
return output
else:
output = object.__new__(cls)
output.schema = None #TODO: WHERE IS THE SCHEMA?
return output
return Container.__new__(cls)
@use_settings
def __init__(self, host, index, type=None, alias=None, name=None, port=9200, read_only=True, settings=None):
if not config.default:
config.default.settings = settings
Container.__init__(self, None, None)
if not containers.config.default:
containers.config.default.settings = settings
self.settings = settings
self.name = coalesce(name, alias, index)
if read_only:
self._es = elasticsearch.Alias(alias=coalesce(alias, index), settings=settings)
else:
self._es = elasticsearch.Cluster(settings=settings).get_index(read_only=read_only, settings=settings)
self.meta = FromESMetadata(settings=settings)
self.settings.type = self._es.settings.type
self.schema = Dict()
self.edges = Dict()
self.worker = None
self._columns = None
self._columns = self.get_columns()
# SWITCH ON TYPED MODE
self.typed = any(c.name in ("$value", "$object") for c in self._columns)
@staticmethod
def wrap(es):
@ -91,15 +98,26 @@ class FromES(Container):
else:
self.worker.join()
@property
def query_path(self):
return join_field(split_field(self.name)[1:])
@property
def url(self):
return self._es.url
def query(self, query):
def query(self, _query):
try:
query = Query(_query, schema=self)
for n in self.namespaces:
query = n.convert(query)
if self.typed:
query = Typed().convert(query)
for s in listwrap(query.select):
if not aggregates1_4[s.aggregate]:
Log.error("ES can not aggregate " + self.select[0].name + " because '" + self.select[0].aggregate + "' is not a recognized aggregate")
if not aggregates1_4.get(s.aggregate):
Log.error("ES can not aggregate " + s.name + " because '" + s.aggregate + "' is not a recognized aggregate")
frum = query["from"]
if isinstance(frum, Query):
@ -108,10 +126,10 @@ class FromES(Container):
q2.frum = result
return qb.run(q2)
if is_deepop(self._es, query):
return es_deepop(self._es, query)
if is_aggsop(self._es, query):
return es_aggsop(self._es, frum, query)
if is_fieldop(self._es, query):
return es_fieldop(self._es, query)
if is_setop(self._es, query):
return es_setop(self._es, query)
if es09_setop.is_setop(query):
@ -125,60 +143,47 @@ class FromES(Container):
Log.error("Problem (Tried to clear Elasticsearch cache)", e)
Log.error("problem", e)
def get_columns(self, table=None):
query_path = self.query_path if self.query_path != "." else None
abs_columns = self.meta.get_columns(table=coalesce(table, self.settings.index))
columns = []
if query_path:
depth = (len(c.nested_path) for c in abs_columns if c.nested_path[0] == query_path).next()
# ADD RELATIVE COLUMNS
for c in abs_columns:
if c.nested_path[0] == query_path:
c = copy(c)
columns.append(c)
c = copy(c)
c.name = c.abs_name[len(query_path) + 1:] if c.type != "nested" else "."
c.relative = True
columns.append(c)
elif not c.nested_path:
c = copy(c)
columns.append(c)
c = copy(c)
c.name = "." + ("." * depth) + c.abs_name
c.relative = True
columns.append(c)
elif depth > len(c.nested_path) and query_path.startswith(c.nested_path[0] + "."):
diff = depth - len(c.nested_path)
c = copy(c)
columns.append(c)
c = copy(c)
c.name = "." + ("." * diff) + (c.abs_name[len(c.nested_path[0]) + 1:] if c.type != "nested" else "")
c.relative = True
columns.append(c)
else:
continue
else:
for c in abs_columns:
if not c.nested_path:
c = copy(c)
c.relative = True
columns.append(c)
def get_relative_columns(self):
if self._columns:
return self._columns
abs_columns=self._get_columns(self.settings.alias, self.path)
def get_columns(self, _from_name=None):
"""
ENSURE COLUMNS FOR GIVEN INDEX/QUERY ARE LOADED, SCRIPT COMPILATION WILL WORK BETTER
_from_name - NOT MEANT FOR EXTERNAL USE
"""
if _from_name is None:
_from_name = self.name
if not isinstance(_from_name, basestring):
Log.error("Expecting string")
output = INDEX_CACHE.get(_from_name)
if output:
# VERIFY es IS CONSISTENT
if self.url != output.url:
Log.error("Using {{name}} for two different containers\n\t{{existing}}\n\t{{new}}",
name= _from_name,
existing= output.url,
new= self._es.url)
return output.columns
path = split_field(_from_name)
if len(path) > 1:
# LOAD THE PARENT (WHICH WILL FILL THE INDEX_CACHE WITH NESTED CHILDREN)
self.get_columns(_from_name=path[0])
return INDEX_CACHE[_from_name].columns
schema = self._es.get_schema()
properties = schema.properties
INDEX_CACHE[_from_name] = output = Dict()
output.name = _from_name
output.url = self._es.url
output.columns = parse_columns(_from_name, properties)
return output.columns
def get_column_names(self):
# GET METADATA FOR INDEX
# LIST ALL COLUMNS
frum = self.get_columns()
return frum.name
return wrap(columns)
def addDimension(self, dim):
if isinstance(dim, list):
@ -189,14 +194,14 @@ class FromES(Container):
dim.full_name = dim.name
for e in dim.edges:
d = Dimension(e, dim, self)
self.schema[d.full_name] = d
self.edges[d.full_name] = d
def __getitem__(self, item):
e = self.schema[item]
e = self.edges[item]
return e
def __getattr__(self, item):
return self.schema[item]
return self.edges[item]
def normalize_edges(self, edges):
output = DictList()
@ -257,23 +262,15 @@ class FromES(Container):
"size": 200000
})
# SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
scripts = DictList()
for k, v in command.set.items():
if not is_keyword(k):
Log.error("Only support simple paths for now")
if "doc" in v.keys():
# scripts.append({
# "script": "ctx._source[" + convert.string2quote(k) + "] = param_",
# "params": {"param_": v["doc"]}
# })
#SIMPLE DOC ASSIGNMENT
scripts.append({"doc": {k: v["doc"]}})
if isinstance(v, Mapping) and v.doc:
scripts.append({"doc": v.doc})
else:
# SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
scripts.append({
"script": "ctx._source[" + convert.string2quote(k) + "] = " + expressions.qb_expression_to_ruby(v) + ";\n"
})
scripts.append({"script": "ctx._source." + k + " = " + expressions.qb_expression_to_ruby(v)})
if results.hits.hits:
updates = []
@ -282,7 +279,7 @@ class FromES(Container):
updates.append({"update": {"_id": h._id, "_routing": unwraplist(h.fields[literal_field(schema._routing.path)])}})
updates.append(s)
content = ("\n".join(convert.value2json(c) for c in updates) + "\n").encode('utf-8')
response = self._es.cluster._post(
response = self._es.cluster.post(
self._es.path + "/_bulk",
data=content,
headers={"Content-Type": "application/json"}
@ -290,97 +287,3 @@ class FromES(Container):
if response.errors:
Log.error("could not update: {{error}}", error=[e.error for i in response["items"] for e in i.values() if e.status not in (200, 201)])
class FromESMetadata(Container):
"""
QUERY THE METADATA
"""
@use_settings
def __init__(self, host, index, alias=None, name=None, port=9200, settings=None):
self.settings = settings
self.name = coalesce(name, alias, index)
self._es = elasticsearch.Cluster(settings=settings)
self.metadata = self._es.get_metadata()
self.columns = None
@property
def url(self):
return self._es.path + "/" + self.name.replace(".", "/")
def __enter__(self):
return self
def __exit__(self, type, value, traceback):
pass
def query(self, _query):
if not self.columns:
self.columns = []
alias_done = set()
metadata = self._es.get_metadata()
for index, meta in qb.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
for _, properties in meta.mappings.items():
columns = _parse_properties(index, properties.properties)
for c in columns:
c.cube = index
c.property = c.name
c.name = None
c.useSource = None
self.columns.extend(columns)
for a in meta.aliases:
# ONLY THE LATEST ALIAS IS CHOSEN TO GET COLUMNS
if a in alias_done:
continue
alias_done.add(a)
for c in columns:
self.columns.append(set_default({"cube": a}, c)) # ENSURE WE COPY
return qb.run(set_default(
{
"from": self.columns,
"sort": ["cube", "property"]
},
_query.as_dict()
))
def get_columns(self, _=None):
"""
RETURN METADATA COLUMNS
"""
if self.name == "meta.columns":
return wrap([
{
"name": "cube",
"type": "string",
"depth": 0
}, {
"name": "column",
"type": "string",
"depth": 0
}, {
"name": "type",
"type": "string",
"depth": 0
}, {
"name": "depth",
"type": "integer",
"depth": 0
}
])
else:
Log.error("Unknonw metadata: {{name}}", name= self.settings.name)
def _parse_properties(index, properties):
"""
ISOLATE THE DEALING WITH THE INDEX_CACHE,
INDEX_CACHE IS REDUNDANT WHEN YOU HAVE metadata.columns
"""
backup = INDEX_CACHE.get(index)
INDEX_CACHE[index] = output = Dict()
output.name = index
columns = parse_columns(index, properties)
INDEX_CACHE[index] = backup
return columns


@ -10,63 +10,93 @@
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from collections import Mapping
from pyLibrary.collections import AND, reverse
from pyLibrary.debugs.logs import Log
from pyLibrary.dot.dicts import Dict
from pyLibrary.dot import coalesce
from pyLibrary.dot import wrap, listwrap
from pyLibrary.dot import coalesce, split_field, join_field, Null
from pyLibrary.dot.lists import DictList
from pyLibrary.dot import wrap, unwrap, listwrap
from pyLibrary.maths import Math
from pyLibrary.queries import expressions
from pyLibrary.queries import wrap_from
from pyLibrary.queries.containers import Container
from pyLibrary.queries.normalize import _normalize_groupby, _normalize_edges, _normalize_where, _normalize_window, _normalize_sort, DEFAULT_LIMIT, _normalize_selects
from pyLibrary.queries.dimensions import Dimension
from pyLibrary.queries.domains import Domain, is_keyword
from pyLibrary.queries.expressions import TRUE_FILTER, simplify_esfilter, query_get_all_vars
DEFAULT_LIMIT = 10
qb = None
def _late_import():
global qb
from pyLibrary.queries import qb
_ = qb
class Query(object):
__slots__ = ["frum", "select", "edges", "groupby", "where", "window", "sort", "limit", "format", "isLean"]
__slots__ = ["frum", "select", "edges", "groupby", "where", "window", "sort", "limit", "having", "format", "isLean"]
def __new__(cls, query, frum):
def __new__(cls, query, schema=None):
if isinstance(query, Query):
return query
return object.__new__(cls)
output = object.__new__(cls)
for s in Query.__slots__:
setattr(output, s, None)
return output
def __init__(self, query, frum):
def __init__(self, query, schema=None):
"""
NORMALIZE QUERY SO IT CAN STILL BE JSON
"""
object.__init__(self)
if isinstance(query, Query):
if isinstance(query, Query) or query == None:
return
object.__init__(self)
query = wrap(query)
self.frum = frum
if not isinstance(self.frum, Container):
Log.error('Expecting from clause to be a Container')
self.format = query.format
self.frum = wrap_from(query["from"], schema=schema)
if query.select:
self.select = _normalize_selects(query.select, frum.schema)
select = query.select
if isinstance(select, list):
names = set()
new_select = []
for s in select:
ns = _normalize_select(s, schema=schema)
if ns.name in names:
Log.error("two select have the same name")
names.add(ns.name)
new_select.append(unwrap(ns))
self.select = wrap(new_select)
elif select:
self.select = _normalize_select(select, schema=schema)
else:
if query.edges or query.groupby:
self.select = {"name": "count", "value": ".", "aggregate": "count"}
else:
self.select = {"name": "__all__", "value": "*", "aggregate": "none"}
self.select = {"name": ".", "value": "*", "aggregate": "none"}
if query.groupby and query.edges:
Log.error("You can not use both the `groupby` and `edges` clauses in the same query!")
elif query.edges:
self.edges = _normalize_edges(query.edges, schema=self.frum.schema)
self.edges = _normalize_edges(query.edges, schema=schema)
self.groupby = None
elif query.groupby:
self.edges = None
self.groupby = _normalize_groupby(query.groupby, schema=self.frum.schema)
self.groupby = _normalize_groupby(query.groupby, schema=schema)
else:
self.edges = []
self.groupby = None
self.where = _normalize_where(query.where, schema=self.frum.schema)
self.where = _normalize_where(query.where, schema=schema)
self.window = [_normalize_window(w) for w in listwrap(query.window)]
self.having = None
self.sort = _normalize_sort(query.sort)
self.limit = coalesce(query.limit, DEFAULT_LIMIT)
if not Math.is_integer(self.limit) or self.limit < 0:
@ -77,9 +107,20 @@ class Query(object):
# DEPTH ANALYSIS - LOOK FOR COLUMN REFERENCES THAT MAY BE DEEPER THAN
# THE from SOURCE IS.
vars = get_all_vars(self, exclude_where=True) # WE WILL EXCLUDE where VARIABLES
for c in self.columns:
if c.name in vars and c.depth:
# TODO: IGNORE REACHING INTO THE NON-NESTED TYPES
if isinstance(self.frum, list):
if not qb:
_late_import()
columns = qb.get_columns(self.frum)
elif isinstance(self.frum, Container):
columns = self.frum.get_columns(table=query["from"])
else:
columns = []
query_path = coalesce(self.frum.query_path, "")
vars = query_get_all_vars(self, exclude_where=True) # WE WILL EXCLUDE where VARIABLES
for c in columns:
if c.name in vars and not query_path.startswith(coalesce(c.nested_path[0], "")):
Log.error("This query, with variable {{var_name}} is too deep", var_name=c.name)
@property
@ -102,48 +143,381 @@ class Query(object):
return output
def get_all_vars(query, exclude_where=False):
"""
:param query:
:param exclude_where: Sometimes we do not what to look at the where clause
:return: all variables in use by query
"""
output = []
for s in listwrap(query.select):
output.extend(select_get_all_vars(s))
for s in listwrap(query.edges):
output.extend(edges_get_all_vars(s))
for s in listwrap(query.groupby):
output.extend(edges_get_all_vars(s))
if not exclude_where:
output.extend(expressions.get_all_vars(query.where))
return output
canonical_aggregates = {
"min": "minimum",
"max": "maximum",
"add": "sum",
"avg": "average",
"mean": "average"
}
def select_get_all_vars(s):
if isinstance(s.value, list):
return set(s.value)
elif isinstance(s.value, basestring):
return set([s.value])
elif s.value == None or s.value == ".":
return set()
def _normalize_selects(selects, schema=None):
if isinstance(selects, list):
output = wrap([_normalize_select(s, schema=schema) for s in selects])
exists = set()
for s in output:
if s.name in exists:
Log.error("{{name}} has already been defined", name= s.name)
exists.add(s.name)
return output
else:
if s.value == "*":
return set(["*"])
return expressions.get_all_vars(s.value)
return _normalize_select(selects, schema=schema)
def edges_get_all_vars(e):
output = []
if isinstance(e.value, basestring):
output.append(e.value)
if e.domain.key:
output.append(e.domain.key)
if e.domain.where:
output.extend(expressions.get_all_vars(e.domain.where))
if e.domain.partitions:
for p in e.domain.partitions:
if p.where:
output.extend(expressions.get_all_vars(p.where))
return output
def _normalize_select(select, schema=None):
if isinstance(select, basestring):
select = select.rstrip(".")
if not select:
return Dict(
name=".",
value="*",
aggregate="none"
)
if schema:
s = schema[select]
if s:
return s.getSelect()
if select.endswith(".*"):
name = select[:-2]
else:
name = select
return Dict(
name=name,
value=select,
aggregate="none"
)
else:
select = wrap(select)
output = select.copy()
if not select.value or isinstance(select.value, basestring):
if select.value == ".":
output.name = coalesce(select.name, select.aggregate)
else:
output.name = coalesce(select.name, select.value, select.aggregate)
elif not output.name:
Log.error("Must give name to each column in select clause")
if not output.name:
Log.error("expecting select to have a name: {{select}}", select= select)
if output.name.endswith(".*"):
output.name = output.name[:-2]
output.aggregate = coalesce(canonical_aggregates.get(select.aggregate), select.aggregate, "none")
return output
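# For example (hypothetical column names):
#
#     _normalize_select("run.*")
#     # -> {"name": "run", "value": "run.*", "aggregate": "none"}
#
#     _normalize_select({"value": "bytes", "aggregate": "add"})
#     # -> {"name": "bytes", "value": "bytes", "aggregate": "sum"}   (alias canonicalized)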
def _normalize_edges(edges, schema=None):
return [_normalize_edge(e, schema=schema) for e in listwrap(edges)]
def _normalize_edge(edge, schema=None):
if isinstance(edge, basestring):
if schema:
e = schema[edge]
if e:
if isinstance(e.fields, list) and len(e.fields) == 1:
return Dict(
name=e.name,
value=e.fields[0],
allowNulls=True,
domain=e.getDomain()
)
else:
return Dict(
name=e.name,
allowNulls=True,
domain=e.getDomain()
)
return Dict(
name=edge,
value=edge,
allowNulls=True,
domain=_normalize_domain(schema=schema)
)
else:
edge = wrap(edge)
if not edge.name and not isinstance(edge.value, basestring):
Log.error("You must name compound edges: {{edge}}", edge= edge)
if isinstance(edge.value, (Mapping, list)) and not edge.domain:
# COMPLEX EDGE IS SHORT HAND
domain = _normalize_domain(schema=schema)
domain.dimension = Dict(fields=edge.value)
return Dict(
name=edge.name,
allowNulls=bool(coalesce(edge.allowNulls, True)),
domain=domain
)
domain = _normalize_domain(edge.domain, schema=schema)
return Dict(
name=coalesce(edge.name, edge.value),
value=edge.value,
range=edge.range,
allowNulls=bool(coalesce(edge.allowNulls, True)),
domain=domain
)
def _normalize_groupby(groupby, schema=None):
if groupby == None:
return None
return [_normalize_group(e, schema=schema) for e in listwrap(groupby)]
def _normalize_group(edge, schema=None):
if isinstance(edge, basestring):
return wrap({
"name": edge,
"value": edge,
"domain": {"type": "default"}
})
else:
edge = wrap(edge)
if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
Log.error("groupby does not accept complicated domains")
if not edge.name and not isinstance(edge.value, basestring):
Log.error("You must name compound edges: {{edge}}", edge= edge)
return wrap({
"name": coalesce(edge.name, edge.value),
"value": edge.value,
"domain": {"type": "default"}
})
def _normalize_domain(domain=None, schema=None):
if not domain:
return Domain(type="default")
elif isinstance(domain, Dimension):
return domain.getDomain()
elif schema and isinstance(domain, basestring) and schema[domain]:
return schema[domain].getDomain()
elif isinstance(domain, Domain):
return domain
if not domain.name:
domain = domain.copy()
domain.name = domain.type
if not isinstance(domain.partitions, list):
domain.partitions = list(domain.partitions)
return Domain(**domain)
def _normalize_window(window, schema=None):
return Dict(
name=coalesce(window.name, window.value),
value=window.value,
edges=[_normalize_edge(e, schema) for e in listwrap(window.edges)],
sort=_normalize_sort(window.sort),
aggregate=window.aggregate,
range=_normalize_range(window.range),
where=_normalize_where(window.where, schema=schema)
)
def _normalize_range(range):
if range == None:
return None
return Dict(
min=range.min,
max=range.max
)
def _normalize_where(where, schema=None):
if where == None:
return TRUE_FILTER
if schema == None:
return where
where = simplify_esfilter(_where_terms(where, where, schema))
return where
def _map_term_using_schema(master, path, term, schema_edges):
"""
IF THE WHERE CLAUSE REFERS TO FIELDS IN THE SCHEMA, THEN EXPAND THEM
"""
output = DictList()
for k, v in term.items():
dimension = schema_edges[k]
if isinstance(dimension, Dimension):
domain = dimension.getDomain()
if dimension.fields:
if isinstance(dimension.fields, Mapping):
# EXPECTING A TUPLE
for local_field, es_field in dimension.fields.items():
local_value = v[local_field]
if local_value == None:
output.append({"missing": {"field": es_field}})
else:
output.append({"term": {es_field: local_value}})
continue
if len(dimension.fields) == 1 and is_keyword(dimension.fields[0]):
# SIMPLE SINGLE-VALUED FIELD
if domain.getPartByKey(v) is domain.NULL:
output.append({"missing": {"field": dimension.fields[0]}})
else:
output.append({"term": {dimension.fields[0]: v}})
continue
if AND(is_keyword(f) for f in dimension.fields):
# EXPECTING A TUPLE
if not isinstance(v, tuple):
Log.error("expecing {{name}}={{value}} to be a tuple", name= k, value= v)
for i, f in enumerate(dimension.fields):
vv = v[i]
if vv == None:
output.append({"missing": {"field": f}})
else:
output.append({"term": {f: vv}})
continue
if len(dimension.fields) == 1 and is_keyword(dimension.fields[0]):
if domain.getPartByKey(v) is domain.NULL:
output.append({"missing": {"field": dimension.fields[0]}})
else:
output.append({"term": {dimension.fields[0]: v}})
continue
if domain.partitions:
part = domain.getPartByKey(v)
if part is domain.NULL or not part.esfilter:
Log.error("not expected to get NULL")
output.append(part.esfilter)
continue
else:
Log.error("not expected")
elif isinstance(v, Mapping):
sub = _map_term_using_schema(master, path + [k], v, schema_edges[k])
output.append(sub)
continue
output.append({"term": {k: v}})
return {"and": output}
def _move_nested_term(master, where, schema):
"""
THE WHERE CLAUSE CAN CONTAIN NESTED PROPERTY REFERENCES, THESE MUST BE MOVED
TO A NESTED FILTER
"""
items = where.term.items()
if len(items) != 1:
Log.error("Expecting only one term")
k, v = items[0]
nested_path = _get_nested_path(k, schema)
if nested_path:
return {"nested": {
"path": nested_path,
"query": {"filtered": {
"query": {"match_all": {}},
"filter": {"and": [
{"term": {k: v}}
]}
}}
}}
return where
def _get_nested_path(field, schema):
if is_keyword(field):
field = join_field([schema.es.alias] + split_field(field))
for i, f in reverse(enumerate(split_field(field))):
path = join_field(split_field(field)[0:i + 1:])
if path in INDEX_CACHE:
return join_field(split_field(path)[1::])
return None
def _where_terms(master, where, schema):
"""
USE THE SCHEMA TO CONVERT DIMENSION NAMES TO ES FILTERS
master - TOP LEVEL WHERE (FOR PLACING NESTED FILTERS)
"""
if isinstance(where, Mapping):
if where.term:
# MAP TERM
try:
output = _map_term_using_schema(master, [], where.term, schema.edges)
return output
except Exception, e:
Log.error("programmer problem?", e)
elif where.terms:
# MAP TERM
output = DictList()
for k, v in where.terms.items():
if not isinstance(v, (list, set)):
Log.error("terms filter expects list of values")
edge = schema.edges[k]
if not edge:
output.append({"terms": {k: v}})
else:
if isinstance(edge, basestring):
# DIRECT FIELD REFERENCE
return {"terms": {edge: v}}
try:
domain = edge.getDomain()
except Exception, e:
Log.error("programmer error", e)
fields = domain.dimension.fields
if isinstance(fields, Mapping):
or_agg = []
for vv in v:
and_agg = []
for local_field, es_field in fields.items():
vvv = vv[local_field]
if vvv != None:
and_agg.append({"term": {es_field: vvv}})
or_agg.append({"and": and_agg})
output.append({"or": or_agg})
elif isinstance(fields, list) and len(fields) == 1 and is_keyword(fields[0]):
output.append({"terms": {fields[0]: v}})
elif domain.partitions:
output.append({"or": [domain.getPartByKey(vv).esfilter for vv in v]})
return {"and": output}
elif where["or"]:
return {"or": [unwrap(_where_terms(master, vv, schema)) for vv in where["or"]]}
elif where["and"]:
return {"and": [unwrap(_where_terms(master, vv, schema)) for vv in where["and"]]}
elif where["not"]:
return {"not": unwrap(_where_terms(master, where["not"], schema))}
return where
def _normalize_sort(sort=None):
"""
CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE
"""
if not sort:
return DictList.EMPTY
output = DictList()
for s in listwrap(sort):
if isinstance(s, basestring) or Math.is_integer(s):
output.append({"value": s, "sort": 1})
else:
output.append({"value": coalesce(s.value, s.field), "sort": coalesce(sort_direction[s.sort], 1)})
return wrap(output)
sort_direction = {
"asc": 1,
"desc": -1,
"none": 0,
1: 1,
0: 0,
-1: -1,
None: 1,
Null: 1
}


@ -405,6 +405,7 @@ class MySQL(object):
)
@staticmethod
@use_settings
def execute_file(
filename,
host,
@ -424,7 +425,7 @@ class MySQL(object):
except Exception, e:
pass
else:
MySQL.execute_sql(settings, sql, param)
MySQL.execute_sql(sql=sql, param=param, settings=settings)
def _execute_backlog(self):
if not self.backlog: return


@ -55,7 +55,7 @@ def unix(value):
def url(value):
"""
CONVERT FROM dict OR string TO URL PARAMETERS
_CONVERT FROM dict OR string TO URL PARAMETERS
"""
if not _convert:
_late_import()
@ -65,7 +65,7 @@ def url(value):
def html(value):
"""
CONVERT FROM unicode TO HTML OF THE SAME
_CONVERT FROM unicode TO HTML OF THE SAME
"""
if not _convert:
_late_import()
@ -553,14 +553,14 @@ def utf82unicode(value):
_late_import()
if not isinstance(value, basestring):
_Log.error("Can not convert {{type}} to unicode because it's not a string", type= type(value).__name__)
_Log.error("Can not _convert {{type}} to unicode because it's not a string", type= type(value).__name__)
e = _Except.wrap(e)
for i, c in enumerate(value):
try:
c.decode("utf8")
except Exception, f:
_Log.error("Can not convert charcode {{c}} in string index {{i}}", i=i, c=ord(c), cause=[e, _Except.wrap(f)])
_Log.error("Can not _convert charcode {{c}} in string index {{i}}", i=i, c=ord(c), cause=[e, _Except.wrap(f)])
try:
latin1 = unicode(value.decode("latin1"))


@ -18,6 +18,8 @@ from pyLibrary.env.files import File
from pyLibrary.queries import qb
from pyLibrary.dot.dicts import Dict
from pyLibrary.dot import unwrap, wrap
from pyLibrary.queries.expressions import qb_expression_to_function
def make_test_instance(name, settings):
if settings.filename:
@ -56,7 +58,7 @@ class Fake_ES():
def search(self, query):
query=wrap(query)
f = convert.esfilter2where(query.query.filtered.filter)
f = qb_expression_to_function(query.query.filtered.filter)
filtered=wrap([{"_id": i, "_source": d} for i, d in self.data.items() if f(d)])
if query.fields:
return wrap({"hits": {"total":len(filtered), "hits": [{"_id":d._id, "fields":unwrap(qb.select([unwrap(d._source)], query.fields)[0])} for d in filtered]}})


@ -8,13 +8,12 @@
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from collections import Mapping
import unittest
from pyLibrary import dot
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import coalesce, Dict, literal_field
from pyLibrary.dot import coalesce, literal_field
from pyLibrary.maths import Math
from pyLibrary.dot import wrap
from pyLibrary.queries.unique_index import UniqueIndex
from pyLibrary.strings import expand_template
@ -68,7 +67,7 @@ def zipall(*args):
while True:
output = zip(*(_next(a) for a in iters))
if all(output[0]):
return
raise StopIteration
else:
yield output[1]


@ -10,33 +10,19 @@ Module `threads`
The main distinction between this library and Python's is two-fold:
1. **Multi-threaded queues do not use serialization** - Serialization is great in the general case, where you may also be communicating between processes, but it is a needless overhead for single-process multi-threading. It is left to the programmer to ensure the messages put on the queue are not changed, which is not an onerous demand.
2. **Shutdown order is deterministic and explicit** - Python's threading library is missing strict conventions for controlled and orderly shutdown.
2. **Shutdown order is deterministic and explicit, if desired** - If there is one aspect missing from Python's threading library, it is a convention for controlled and orderly shutdown.
* All threads are required to accept a `please_stop` token and are expected to test for its signal in a timely manner and exit when signalled.
* All threads have a parent - The parent is responsible for ensuring their children get the `please_stop` signal, and are dead, before stopping themselves.
* All threads have a parent, which is ultimately responsible for ensuring its children get the `please_stop` signal and are dead before it stops itself.
These conventions eliminate the need for `interrupt()` and `abort()`, both of which are unstable idioms when resources must be released. Each thread can shut down on its own terms, but is expected to do so expediently. A minimal sketch of the convention follows.
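A minimal sketch (the worker body is hypothetical; `Thread.run`, `Signal`, and the `please_stop` token are the pieces described above):

    from pyLibrary.thread.threads import Thread, Signal

    def worker(please_stop):
        while not please_stop:
            do_one_unit_of_work()   # hypothetical work
            Thread.sleep(1)

    stopper = Signal()
    Thread.run("worker", worker, please_stop=stopper)
    # ... later, during shutdown ...
    stopper.go()   # the worker notices on its next check and exits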
###What's it used for###
A good amount of time is spent waiting for underlying C libraries and OS
services to respond to network and file access requests. Multiple
threads can make your code faster despite the GIL when dealing with those
requests. For example, by moving logging off the main thread, we can get
up to 15% increase in overall speed because we no longer have the main thread
waiting for disk writes or remote logging posts. Please note, this level of
speed improvement can only be realized if there is no serialization happening
at the multi-threaded queue.
A good amount of time is spent waiting for underlying C libraries and OS services to respond to network and file access requests. Multiple threads can make your code faster despite the GIL when dealing with those requests. For example, by moving logging off the main thread, we can get up to a 15% increase in overall speed because we no longer have the main thread waiting for disk writes or remote logging posts. Please note, this level of speed improvement can only be realized if there is no serialization happening at the multi-threaded queue.
###Asynch vs. Actors###
My personal belief is that [actors](http://en.wikipedia.org/wiki/Actor_model)
are easier to reason about than [asynch tasks](https://docs.python.org/3/library/asyncio-task.html).
Mixing regular methods and co-routines (with their `yield from` pollution) is
dangerous because:
1) calling styles between methods and co-routines can be easily confused
2) actors can use methods, co-routines can not
3) there is no way to manage resource priority with co-routines.
4) stack traces are lost with co-routines
My personal belief is that [actors](http://en.wikipedia.org/wiki/Actor_model) are easier to reason about than [asynch tasks](https://docs.python.org/3/library/asyncio-task.html).
Synchronization Primitives
--------------------------
@ -45,39 +31,10 @@ There are three major aspects of a synchronization primitive:
* **Resource** - Monitors and locks can only be owned by one thread at a time
* **Binary** - The primitive has only two states
* **Irreversible** - The state of the primitive can only be set, or advanced, never reversed
* **Reversible** - The state of the primitive can be set, or advanced, and reversed again
The last, *irreversibility* is very useful, but ignored in many threading
libraries. The irreversibility allows us to model progression; and
we can allow threads to poll for progress, or be notified of progress.
These three aspects can be combined to give us 8 synchronization primitives:
* `- - -` - Semaphore
* `- B -` - Binary Semaphore
* `R - -` - Monitor
* `R B -` - Lock
* `- - I` - Progress
* `- B I` - Signal
* `R - I` - ?limited usefulness?
* `R B I` - ?limited usefulness?
The last, *reversibility*, is very useful but ignored in many threading libraries. The lack of reversibility (irreversibility) is what allows us to model progression; we can let threads poll for progress, or be notified of it.
###Class `Signal`###
An irreversible binary semaphore used to signal state progression.
**Method `wait_for_go(self, timeout=None, till=None)`**
Put a thread into a waiting state until the signal is activated
**Method `go(self)`**
Activate the signal. Does nothing if already activated.
**Method `is_go(self)`**
Test if the signal is activated; do not wait.
**Method `on_go(self, target)`**
Run the `target` method when the signal is activated. The activating thread will be running the target method, so be sure you are not accessing resources.
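A short sketch of `Signal` in use (the surrounding threads are hypothetical):

    is_done = Signal()

    # in the thread doing the work, once it finishes:
    is_done.go()

    # in a thread waiting on that progress:
    is_done.wait_for_go()
    assert is_done.is_go()

    # or register a callback instead of blocking:
    is_done.on_go(lambda: Log.note("work finished"))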


@ -10,102 +10,71 @@ from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
import subprocess
from pyLibrary.debugs.logs import Log
from pyLibrary.thread.threads import Queue
# YOU ARE READING AN INCOMPLETE IMPLEMENTATION
class worker(object):
def __init__(func, inbound, outbound, logging):
logger = Log_usingInterProcessQueue(logging)
from pyLibrary.thread.threads import Queue, Thread, Signal
DEBUG=True
class Log_usingInterProcessQueue(Log):
def __init__(self, outbound):
self.outbound = outbound
class Process(object):
def write(self, template, params):
self.outbound.put({"template": template, "param": params})
def __init__(self, name, params, cwd=None):
self.name = name
self.service_stopped = Signal()
self.send = Queue("send")
self.recieve = Queue("recieve")
class Multiprocess(object):
# THE COMPLICATION HERE IS CONNECTING THE DISPARATE LOGGING TO
# A CENTRAL POINT
# ONLY THE MAIN THREAD CAN CREATE AND COMMUNICATE WITH multiprocess.Process
def __init__(self, functions):
self.outbound = Queue("out to process")
self.inbound = Queue("in from stdin")
self.inbound = Queue("in from stderr")
# MAKE
# MAKE THREADS
self.threads = []
for t, f in enumerate(functions):
thread = worker(
"worker " + unicode(t),
f,
self.inbound,
self.outbound,
try:
self.service = service = subprocess.Popen(
params,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
bufsize=-1,
cwd=cwd
)
self.threads.append(thread)
def __enter__(self):
return self
# WAIT FOR ALL QUEUED WORK TO BE DONE BEFORE RETURNING
def __exit__(self, a, b, c):
try:
self.inbound.close() # SEND STOPS TO WAKE UP THE WORKERS WAITING ON inbound.pop()
self.stopper = Signal()
self.stopper.on_go(lambda: service.kill())
Thread.run(self.name+" waiter", waiter, self)
Thread.run(self.name+" stdout", reader, service.stdout, self.recieve, please_stop=self.stopper)
Thread.run(self.name+" stderr", reader, service.stderr, self.recieve, please_stop=self.stopper)
Thread.run(self.name+" stdin", writer, service.stdin, self.recieve, please_stop=self.stopper)
except Exception, e:
Log.warning("Problem adding to inbound", e)
Log.error("Can not call", e)
self.join()
# IF YOU SENT A stop(), OR STOP, YOU MAY WAIT FOR SHUTDOWN
def join(self):
try:
# WAIT FOR FINISH
for t in self.threads:
t.join()
except (KeyboardInterrupt, SystemExit):
Log.note("Shutdow Started, please be patient")
except Exception, e:
Log.error("Unusual shutdown!", e)
finally:
for t in self.threads:
t.keep_running = False
for t in self.threads:
t.join()
self.inbound.close()
self.outbound.close()
# RETURN A GENERATOR THAT HAS len(parameters) RESULTS (ANY ORDER)
def execute(self, parameters):
# FILL QUEUE WITH WORK
self.inbound.extend(parameters)
num = len(parameters)
def output():
for i in xrange(num):
result = self.outbound.pop()
yield result
return output()
# EXTERNAL COMMAND THAT RETURNS IMMEDIATELY
def stop(self):
self.inbound.close() # SEND STOPS TO WAKE UP THE WORKERS WAITING ON inbound.pop()
for t in self.threads:
t.keep_running = False
self.stopper.go()
self.send.add("exit")
def join(self):
self.service_stopped.wait_for_go()
def waiter(this, please_stop):
this.service.wait()
if DEBUG:
Log.alert("{{name}} stopped", name=this.name)
this.service_stopped.go()
def reader(stdout, recieve, please_stop):
while not please_stop:
line = stdout.readline()
if line:
recieve.add(line)
Log.note("FROM PROCESS: {{line}}", line=line.rstrip())
else:
Thread.sleep(1)
stdout.close()
def writer(stdin, send, please_stop):
while not please_stop:
line = send.pop()
if line:
stdin.write(line+"\n")
stdin.close()
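A brief usage sketch of the `Process` class above, pieced together from this diff (the command and the exchanged lines are made up; the attribute spelling `recieve` follows the code as written):

    p = Process("demo", ["python", "-u", "child.py"])  # hypothetical child program
    p.send.add("hello")        # the stdin writer thread forwards queued lines to the child
    line = p.recieve.pop()     # the stdout/stderr reader threads enqueue the child's output
    p.stop()                   # fires the stopper Signal and queues "exit" for the writer
    p.join()                   # blocks until service_stopped goes (set by the waiter thread)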

View file

@ -29,17 +29,17 @@ from pyLibrary.times.dates import Date
from pyLibrary.times.durations import SECOND
_Log = None
Log = None
DEBUG = True
MAX_DATETIME = datetime(2286, 11, 20, 17, 46, 39)
def _late_import():
global _Log
global Log
from pyLibrary.debugs.logs import Log as _Log
from pyLibrary.debugs.logs import Log
_ = _Log
_ = Log
class Lock(object):
@ -67,7 +67,7 @@ class Lock(object):
def wait(self, timeout=None, till=None):
if till:
timeout = (datetime.utcnow() - till).total_seconds()
timeout = (till - Date.now()).seconds
if timeout < 0:
return
self.monitor.wait(timeout=float(timeout) if timeout else None)
@ -94,7 +94,6 @@ class Queue(object):
self.lock = Lock("lock for queue " + name)
self.queue = deque()
self.next_warning = datetime.utcnow() # FOR DEBUGGING
self.gc_count = 0
def __iter__(self):
while self.keep_running:
@ -103,9 +102,9 @@ class Queue(object):
if value is not Thread.STOP:
yield value
except Exception, e:
_Log.warning("Tell me about what happened here", e)
Log.warning("Tell me about what happened here", e)
_Log.note("queue iterator is done")
Log.note("queue iterator is done")
def add(self, value):
@ -115,6 +114,18 @@ class Queue(object):
self.queue.append(value)
return self
def push(self, value):
"""
SNEAK value TO FRONT OF THE QUEUE
"""
with self.lock:
self._wait_for_queue_space()
if self.keep_running:
self.queue.appendleft(value)
return self
def extend(self, values):
with self.lock:
# ONCE THE queue IS BELOW LIMIT, ALLOW ADDING MORE
@ -142,7 +153,7 @@ class Queue(object):
now = datetime.utcnow()
if self.next_warning < now:
self.next_warning = now + timedelta(seconds=wait_time)
_Log.alert("Queue {{name}} is full ({{num}} items), thread(s) have been waiting {{wait_time}} sec",
Log.alert("Queue {{name}} is full ({{num}} items), thread(s) have been waiting {{wait_time}} sec",
name=self.name,
num=len(self.queue),
wait_time=wait_time
@ -156,20 +167,21 @@ class Queue(object):
with self.lock:
return any(r != Thread.STOP for r in self.queue)
def pop(self, till=None):
def pop(self, till=None, timeout=None):
"""
WAIT FOR NEXT ITEM ON THE QUEUE
RETURN Thread.STOP IF QUEUE IS CLOSED
IF till IS PROVIDED, THEN pop() CAN TIMEOUT AND RETURN None
"""
if timeout:
till = Date.now() + timeout
with self.lock:
if till == None:
if not till:
while self.keep_running:
if self.queue:
value = self.queue.popleft()
self.gc_count += 1
if self.gc_count % 1000 == 0:
gc.collect()
if value is Thread.STOP: # SENDING A STOP INTO THE QUEUE IS ALSO AN OPTION
self.keep_running = False
return value
@ -195,7 +207,7 @@ class Queue(object):
if self.keep_running:
return None
_Log.note("queue stopped")
Log.note("queue stopped")
return Thread.STOP
@ -217,6 +229,21 @@ class Queue(object):
self.queue.clear()
return output
def pop_one(self):
"""
NON-BLOCKING POP IN QUEUE, IF ANY
"""
with self.lock:
if not self.keep_running:
return [Thread.STOP]
elif not self.queue:
return None
else:
v = self.queue.pop()
if v is Thread.STOP: # SENDING A STOP INTO THE QUEUE IS ALSO AN OPTION
self.keep_running = False
return v
def close(self):
with self.lock:
self.keep_running = False
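A short sketch of the queue methods added in this file (`SECOND` is the duration imported at the top of the module; treating `timeout` as a Duration is an assumption based on `till = Date.now() + timeout`):

    q = Queue("demo")
    q.add("routine work")
    q.push("urgent work")                # sneaks to the front of the queue
    first = q.pop(timeout=SECOND * 10)   # waits up to ~10s; returns None if nothing arrives
    extra = q.pop_one()                  # non-blocking; returns None when the queue is empty
    q.close()                            # subsequent pop() calls return Thread.STOP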
@ -237,7 +264,7 @@ class AllThread(object):
"""
def __init__(self):
if not _Log:
if not Log:
_late_import()
self.threads = []
@ -256,10 +283,10 @@ class AllThread(object):
if "exception" in response:
exceptions.append(response["exception"])
except Exception, e:
_Log.warning("Problem joining", e)
Log.warning("Problem joining", e)
if exceptions:
_Log.error("Problem in child threads", exceptions)
Log.error("Problem in child threads", exceptions)
def add(self, target, *args, **kwargs):
@ -292,7 +319,7 @@ class MainThread(object):
children = copy(self.children)
for c in reversed(children):
if c.name:
_Log.note("Stopping thread {{name|quote}}", name=c.name)
Log.note("Stopping thread {{name|quote}}", name=c.name)
c.stop()
for c in children:
c.join()
@ -317,7 +344,7 @@ class Thread(object):
def __init__(self, name, target, *args, **kwargs):
if not _Log:
if not Log:
_late_import()
self.id = -1
self.name = name
@ -357,14 +384,14 @@ class Thread(object):
self.kwargs = None
def start(self):
if not _Log:
if not Log:
_late_import()
try:
self.thread = thread.start_new_thread(Thread._run, (self, ))
return self
except Exception, e:
_Log.error("Can not start thread", e)
Log.error("Can not start thread", e)
def stop(self):
for c in copy(self.children):
@ -378,7 +405,7 @@ class Thread(object):
self.children.remove(child)
def _run(self):
if _Log.cprofiler:
if Log.cprofiler:
import cProfile
self.cprofiler = cProfile.Profile()
@ -398,7 +425,7 @@ class Thread(object):
with self.synch_lock:
self.response = Dict(exception=e)
try:
_Log.fatal("Problem in thread {{name|quote}}", name=self.name, cause=e)
Log.fatal("Problem in thread {{name|quote}}", name=self.name, cause=e)
except Exception, f:
sys.stderr.write("ERROR in thread: " + str(self.name) + " " + str(e) + "\n")
finally:
@ -420,7 +447,7 @@ class Thread(object):
import pstats
self.cprofiler.disable()
_Log.cprofiler_stats.add(pstats.Stats(self.cprofiler))
Log.cprofiler_stats.add(pstats.Stats(self.cprofiler))
del self.cprofiler
def is_alive(self):
@ -435,7 +462,7 @@ class Thread(object):
if till is None:
till = datetime.utcnow() + timedelta(seconds=timeout)
else:
_Log.error("Can not except both `timeout` and `till`")
Log.error("Can not except both `timeout` and `till`")
children = copy(self.children)
for c in children:
@ -451,7 +478,7 @@ class Thread(object):
self.synch_lock.wait(0.5)
if DEBUG:
_Log.note("Waiting on thread {{thread|json}}", thread=self.name)
Log.note("Waiting on thread {{thread|json}}", thread=self.name)
else:
self.stopped.wait_for_go(till=till)
if self.stopped:
@ -464,12 +491,12 @@ class Thread(object):
@staticmethod
def run(name, target, *args, **kwargs):
if not _Log:
if not Log:
_late_import()
# ENSURE target HAS please_stop ARGUMENT
if "please_stop" not in target.__code__.co_varnames:
_Log.error("function must have please_stop argument for signalling emergency shutdown")
Log.error("function must have please_stop argument for signalling emergency shutdown")
Thread.num_threads += 1
@ -478,7 +505,7 @@ class Thread(object):
return output
@staticmethod
def sleep(seconds=None, till=None, please_stop=None):
def sleep(seconds=None, till=None, timeout=None, please_stop=None):
if please_stop is not None or isinstance(till, Signal):
if isinstance(till, Signal):
@ -487,6 +514,8 @@ class Thread(object):
if seconds is not None:
till = datetime.utcnow() + timedelta(seconds=seconds)
elif timeout is not None:
till = datetime.utcnow() + timedelta(seconds=timeout.seconds)
elif till is None:
till = MAX_DATETIME
@ -528,9 +557,9 @@ class Thread(object):
please_stop.on_go(lambda: MAIN_THREAD.stop())
if Thread.current() != MAIN_THREAD:
if not _Log:
if not Log:
_late_import()
_Log.error("Only the main thread can sleep forever (waiting for KeyboardInterrupt)")
Log.error("Only the main thread can sleep forever (waiting for KeyboardInterrupt)")
try:
if allow_exit:
@ -539,7 +568,7 @@ class Thread(object):
_wait_for_interrupt(please_stop)
except (KeyboardInterrupt, SystemExit), _:
please_stop.go()
_Log.alert("SIGINT Detected! Stopping...")
Log.alert("SIGINT Detected! Stopping...")
MAIN_THREAD.stop()
@ -607,7 +636,7 @@ class Signal(object):
try:
j()
except Exception, e:
_Log.warning("Trigger on Signal.go() failed!", e)
Log.warning("Trigger on Signal.go() failed!", e)
def is_go(self):
"""
@ -642,7 +671,7 @@ class ThreadedQueue(Queue):
period=None, # MAX TIME BETWEEN FLUSHES TO SLOWER QUEUE
silent=False # WRITES WILL COMPLAIN IF THEY ARE WAITING TOO LONG
):
if not _Log:
if not Log:
_late_import()
batch_size = coalesce(batch_size, int(coalesce(max_size, 0) / 2), 900)
@ -688,7 +717,7 @@ class ThreadedQueue(Queue):
_buffer.append(item)
except Exception, e:
_Log.warning(
Log.warning(
"Unexpected problem",
name=name,
cause=e
@ -706,7 +735,7 @@ class ThreadedQueue(Queue):
next_time = now + bit_more_time
except Exception, e:
_Log.warning(
Log.warning(
"Problem with {{name}} pushing {{num}} items to data sink",
name=name,
num=len(_buffer),
@ -717,7 +746,7 @@ class ThreadedQueue(Queue):
# ONE LAST PUSH, DO NOT HAVE TIME TO DEAL WITH ERRORS
queue.extend(_buffer)
self.thread = Thread.run("threaded queue for " + name, worker_bee, parent_thread=self)
self.thread = Thread.run("threaded queue for " + name, worker_bee)
def add(self, value):
with self.lock:
@ -776,15 +805,46 @@ def _wait_for_exit(please_stop):
cr_count = -1000000 # NOT /dev/null
if strings.strip(line) == "exit":
_Log.alert("'exit' Detected! Stopping...")
Log.alert("'exit' Detected! Stopping...")
return
def _wait_for_interrupt(please_stop):
while not please_stop:
if DEBUG:
_Log.note("inside wait-for-shutdown loop")
Log.note("inside wait-for-shutdown loop")
try:
Thread.sleep(please_stop=please_stop)
except Exception, _:
pass
class Till(Signal):
"""
MANAGE THE TIMEOUT LOGIC
"""
def __init__(self, till=None, timeout=None, seconds=None):
Signal.__init__(self)
timers = []
def go():
self.go()
for t in timers:
t.cancel()
if isinstance(till, Date):
t = threading.Timer((till - Date.now()).seconds, go)
t.start()
timers.append(t)
if timeout:
t = threading.Timer(timeout.seconds, go)
t.start()
timers.append(t)
if seconds:
t = threading.Timer(seconds, go)
t.start()
timers.append(t)
if isinstance(till, Signal):
till.on_go(go)
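A minimal sketch of how a `Till` might be used (the early-return behaviour of `Thread.sleep` and the truthiness of a gone `Signal` are assumptions based on the surrounding code):

    give_up = Till(seconds=5)                       # a Signal that goes on its own after ~5 seconds
    Thread.sleep(seconds=60, please_stop=give_up)   # assumed to wake early once give_up goes
    if give_up:                                     # Signals read as truthy once they have gone
        pass  # deadline passed; stop waiting and clean up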

View file

@ -12,6 +12,7 @@ from __future__ import division
from __future__ import absolute_import
import datetime
from decimal import Decimal
from pyLibrary import regex
from pyLibrary.vendor.dateutil.relativedelta import relativedelta
@ -20,14 +21,12 @@ from pyLibrary.maths import Math
from pyLibrary.dot import wrap
_Date = None
_Log = None
Date = None
Log = None
def _delayed_import():
global _Date
from pyLibrary.times.dates import Date as _Date
_ = _Date(None)
global Date
from pyLibrary.times.dates import Date
_ = Date(None)
class Duration(object):
@ -71,7 +70,7 @@ class Duration(object):
@staticmethod
def range(start, stop, step):
if not step:
_Log.error("Expecting a non-zero duration for interval")
Log.error("Expecting a non-zero duration for interval")
output = []
c = start
while c < stop:
@ -88,12 +87,12 @@ class Duration(object):
return output
def __radd__(self, other):
if not _Date:
if not Date:
_delayed_import()
if isinstance(other, datetime.datetime):
return _Date(other).add(self)
elif isinstance(other, _Date):
return Date(other).add(self)
elif isinstance(other, Date):
return other.add(self)
return self + other
@ -212,10 +211,10 @@ class Duration(object):
@property
def seconds(self):
return self.milli / 1000.0
return float(self.milli) / 1000.0
def total_seconds(self):
return self.milli / 1000.0
return float(self.milli) / 1000.0
def __str__(self):
return str(self.__unicode__())

File diff suppressed because one or more lines are too long

View file

@ -1,120 +0,0 @@
{
"production_es": {
"description": "pointer to es with known good results",
"host": "http://elasticsearch7.metrics.scl3.mozilla.com",
"port": "9200",
"index": "bugs",
"type": "bug_version",
"debug": true
},
"public_bugs_reference": {
"description": "pointer to es with known good *public* results",
"filename": "./tests/resources/public_bugs_reference_es.json"
},
"public_comments_reference": {
"description": "pointer to es with known good public comments",
"filename": "./tests/resources/public_comments_reference_es.json"
},
"private_bugs_reference": {
"description": "pointer to es with known good results",
"filename": "./tests/resources/private_bugs_reference_es.json"
},
"private_comments_reference": {
"description": "pointer to es with known good private comments",
"filename": "./tests/resources/private_comments_reference_es.json"
},
"candidate": {
"description": "pointer to es with test results",
"filename": "./tests/results/test_results.json",
"host": "http://localhost",
"port": "9200",
"index": "test_bugs",
"type": "bug_version"
},
"fake":{
//FOR TESTING JSON CREATION, NO NEED FOR REAL ES
"bugs": {
"filename":"./tests/results/test_bugs.json"
},
"comments": {
"filename":"./tests/results/test_comments.json"
}
},
"real":{
//FOR TESTING INCREMENTAL ETL (AND GENERAL INTERACTION WITH A REAL ES)
"bugs": {
"host": "http://localhost",
"port": "9200",
"index": "test_bugs",
"type": "bug_version",
"schema_file": "./resources/json/bug_version.json",
"debug": true
},
"comments": {
"host": "http://localhost",
"port": "9200",
"index": "test_comments",
"type": "bug_version",
"schema_file": "./resources/json/bug_comments.json",
"debug": true
}
},
"param": {
"increment": 10000,
"bugs": [ 384, 1108, 1045, 1046, 1157, 1877, 1865, 1869,
2586, 3140, 6810, 9622, 10575, 11040, 12911, 67742,
96421, 123203, 178960, 367518, 457765, 458397, 471427, 544327,
547727, 643420, 692436, 726635, 813650
// ADD 372836 (REVIEW FLAGS TEST)
// 13534 (REVIEW MOVES TO OTHER PERSON)
// 393845 added blocking1.9+ twice
// 671185 *many* review requests
// 937428 whitespace after comma in user story, complex diff
// 248970 another cutoff review request
],
"alias_increment": 1000000,
"alias_file": {
"path": "./resources/json/bugzilla_aliases.json"
},
"temp_dir": "./tests/resources",
"errors": "./tests/results/errors",
"allow_private_bugs": true,
"last_run_time": "./tests/results/last_run_time.txt",
"first_run_time": "./tests/results/first_run_time.txt"
},
"bugzilla": {
"filename": "./tests/resources/sql/small_bugzilla.sql",
"preamble": "from https://github.com/klahnakoski/Bugzilla-ETL",
"host": "localhost",
"port": 3306,
"username": "user",
"password": "password",
"schema": "test_bugzilla",
"expires_on": 1372867005000,
"debug": false
},
"debug": {
"profile": false,
"trace": false,
"log": [
{
"class": "logging.handlers.RotatingFileHandler",
"filename": "./tests/results/logs/test_etl.log",
"maxBytes": 10000000,
"backupCount": 200,
"encoding": "utf8"
},
{
"log_type": "stream",
"stream": "sys.stdout"
},
{
"log_type": "elasticsearch",
"host": "http://klahnakoski-es.corp.tor1.mozilla.com",
"index": "debug",
"type": "bz_etl"
}
]
}
}

View file

@ -32,7 +32,7 @@ def get_resources(source, destination):
setup(
name='Bugzilla-ETL',
version="0.3.13353",
version="2.0.13353",
description='Mozilla Bugzilla Bug Version ETL',
long_description=long_desc,
author='Kyle Lahnakoski',

File diff suppressed because one or more lines are too long

View file

@ -6,12 +6,12 @@ from pymysql.times import TimeDelta
from bzETL.extract_bugzilla import SCREENED_WHITEBOARD_BUG_GROUPS
from pyLibrary.env import startup, elasticsearch
from pyLibrary import struct
from pyLibrary.cnv import CNV
from pyLibrary import convert
from pyLibrary.env.emailer import Emailer
from pyLibrary.env.logs import Log, extract_stack
from pyLibrary.maths import Math
from pyLibrary.queries import Q
from pyLibrary.struct import nvl, Struct
from pyLibrary.queries import qb
from pyLibrary.struct import coalesce, Dict
# WRAP Log.error TO SHOW THE SPECIFIC ERROR IN THE LOGFILE
if not hasattr(Log, "old_error"):
@ -26,7 +26,7 @@ if not hasattr(Log, "old_error"):
##ASSIGN AS CLASS METHOD
Log.error=MethodType(new_error, Log)
NOW = CNV.datetime2milli(datetime.utcnow())
NOW = convert.datetime2milli(datetime.utcnow())
A_WHILE_AGO = int(NOW - TimeDelta(minutes=10).total_seconds()*1000)
@ -58,7 +58,7 @@ class TestLookForLeaks(unittest.TestCase):
"facets": {"0": {"statistical": {"field": "bug_id"}}}
}).facets["0"].max
return reversed(list(Q.intervals(0, max_bug_id, self.settings.param.increment)))
return reversed(list(qb.intervals(0, max_bug_id, self.settings.param.increment)))
def test_private_bugs_not_leaking(self):
bad_news = False
@ -103,9 +103,9 @@ class TestLookForLeaks(unittest.TestCase):
Log.note("{{num}} leaks!! {{bugs}}", {
"num": len(leaked_bugs),
"bugs": Q.run({
"bugs": qb.run({
"from":leaked_bugs,
"select":["bug_id", "bug_version_num", {"name":"modified_ts", "value":lambda d: CNV.datetime2string(CNV.milli2datetime(d.modified_ts))}],
"select":["bug_id", "bug_version_num", {"name":"modified_ts", "value":lambda d: convert.datetime2string(convert.milli2datetime(d.modified_ts))}],
"sort":"bug_id"
})
})
@ -170,7 +170,7 @@ class TestLookForLeaks(unittest.TestCase):
fields=["bug_id", "bug_group", "attachments", "modified_ts"]
)
private_attachments = Q.run({
private_attachments = qb.run({
"from": bugs_w_private_attachments,
"select": "attachments.attach_id",
"where": {"or": [
@ -181,7 +181,7 @@ class TestLookForLeaks(unittest.TestCase):
try:
private_attachments = [int(v) for v in private_attachments]
except Exception, e:
private_attachments = Q.run({
private_attachments = qb.run({
"from": bugs_w_private_attachments,
"select": "attachments.attach_id",
"where": {"or": [
@ -263,29 +263,29 @@ class TestLookForLeaks(unittest.TestCase):
if leaked_whiteboard:
for l in leaked_whiteboard:
l.modified_ts=CNV.datetime2string(CNV.milli2datetime(l.modified_ts))
l.modified_ts=convert.datetime2string(convert.milli2datetime(l.modified_ts))
Log.error("Whiteboard leaking:\n{{leak|indent}}", {"leak": leaked_whiteboard})
def get(es, esfilter, fields=None, limit=None):
query = struct.wrap({
query = wrap({
"query": {"filtered": {
"query": {"match_all": {}},
"filter": esfilter
}},
"from": 0,
"size": nvl(limit, 200000),
"size": coalesce(limit, 200000),
"sort": []
})
if fields:
query.fields=fields
results = es.search(query)
return Q.select(results.hits.hits, "fields")
return qb.select(results.hits.hits, "fields")
else:
results = es.search(query)
return Q.select(results.hits.hits, "_source")
return qb.select(results.hits.hits, "_source")
@ -300,8 +300,8 @@ def milli2datetime(r):
elif isinstance(r, basestring):
return r
elif Math.is_number(r):
if CNV.value2number(r) > 800000000000:
return CNV.datetime2string(CNV.milli2datetime(r), "%Y-%m-%d %H:%M:%S")
if convert.value2number(r) > 800000000000:
return convert.datetime2string(convert.milli2datetime(r), "%Y-%m-%d %H:%M:%S")
else:
return r
elif isinstance(r, dict):
@ -320,7 +320,7 @@ def milli2datetime(r):
if not output:
return None
try:
return Q.sort(output)
return qb.sort(output)
except Exception:
return output
else:
@ -339,7 +339,7 @@ def main():
if results.errors or results.failures:
error(results)
except Exception, e:
error(Struct(errors=[e]))
error(Dict(errors=[e]))
finally:
pass

View file

@ -1,13 +1,13 @@
# encoding: utf-8
#
from pyLibrary.sql.db import DB, SQL
from pyLibrary.sql.db import MySQL, SQL
from pyLibrary.env.logs import Log
from pyLibrary.env import startup
def main():
"""
MEANT TO BE RUN JUST ONCE IN DEVELOPMENT TO CONVERT A BIG PUBLIC
DATABASE (8G+) INTO A TINY TESTING DB (FOR ADDING TO REPOSITORY)
DATABASE (8G+) INTO A TINY TESTING MySQL (FOR ADDING TO REPOSITORY)
"""
try:
settings=startup.read_settings()
@ -20,7 +20,7 @@ def main():
Log.note("Scrubbing db of those pesky records.")
Log.note("This is going to take hours ...")
DB.execute_file(settings.bugzilla, "./tests/resources/sql/scrub_db.sql", {
MySQL.execute_file(settings.bugzilla, "./tests/resources/sql/scrub_db.sql", {
"schema":settings.bugzilla.schema,
"bug_list":SQL(settings.param.bugs)
})

View file

@ -1,19 +1,19 @@
# encoding: utf-8
#
from pyLibrary import struct
from pyLibrary.cnv import CNV
from pyLibrary import convert
from pyLibrary.env.files import File
from pyLibrary.env.logs import Log
from pyLibrary.queries import Q
from pyLibrary.queries import qb
from pyLibrary.env import startup
def main(settings):
file = File(settings.param.alias_file)
aliases = CNV.JSON2object(file.read())
aliases = convert.json2value(file.read())
for v in aliases.values():
v.candidates = CNV.dict2Multiset(v.candidates)
v.candidates = convert.dict2Multiset(v.candidates)
data = [
{
@ -24,7 +24,7 @@ def main(settings):
if d.canonical != None and n != d.canonical
]
sorted = Q.sort(data, "found")
sorted = qb.sort(data, "found")
for s in sorted:
Log.note("{{found}} == {{lost}}", s)
@ -35,11 +35,11 @@ def main(settings):
}
rev_clean = struct.inverse(clean)
Log.note(CNV.object2JSON(rev_clean, pretty=True))
Log.note(convert.value2json(rev_clean, pretty=True))
for k, v in rev_clean.items():
if len(v) > 3:
Log.note(CNV.object2JSON({k: v}, pretty=True))
Log.note(convert.value2json({k: v}, pretty=True))
def start():

View file

@ -24,6 +24,9 @@ WHERE
;
COMMIT;
START TRANSACTION;
DELETE FROM
tracking_flags_bugs
@ -182,6 +185,8 @@ INSERT INTO keep_profiles SELECT watch_user FROM components;
DELETE FROM keep_profiles WHERE id IS NULL;
DELETE FROM profiles WHERE userid NOT IN (SELECT DISTINCT id FROM keep_profiles);
DELETE FROM bug_mentors
DROP TABLE IF EXISTS keep_profiles;
UPDATE profiles SET public_key=NULL;
COMMIT;
@ -387,5 +392,10 @@ DELETE FROM whine_schedules;
DELETE FROM quips;
COMMIT;
START TRANSACTION ;
DELETE FROM
SET foreign_key_checks = 1;

File diff suppressed because one or more lines are too long

View file

@ -10,39 +10,38 @@
from datetime import datetime
import unittest
from bzETL import extract_bugzilla, bz_etl
from bzETL.bz_etl import etl
from bzETL.extract_bugzilla import get_current_time, SCREENED_WHITEBOARD_BUG_GROUPS
from pyLibrary.cnv import CNV
from pyLibrary import convert
from pyLibrary.collections import MIN
from pyLibrary.queries.db_query import esfilter2sqlwhere
from pyLibrary.sql.db import DB, all_db
from pyLibrary.env.logs import Log
from pyLibrary.env.elasticsearch import ElasticSearch
from pyLibrary.debugs import startup, constants
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import Dict, Null, wrap
from pyLibrary.env.files import File
from pyLibrary.queries import Q
from pyLibrary.maths.randoms import Random
from pyLibrary.env import startup
from pyLibrary import struct
from pyLibrary.struct import Struct, Null
from pyLibrary.queries import qb
from pyLibrary.queries.qb_usingMySQL import esfilter2sqlwhere
from pyLibrary.sql.mysql import MySQL, all_db
from pyLibrary.testing import elasticsearch
from pyLibrary.thread.threads import ThreadedQueue, Thread
from pyLibrary.times.timer import Timer
from util import compare_es, database
from util import database, compare_es
from util.compare_es import get_all_bug_versions
from util.database import diff
BUG_GROUP_FOR_TESTING = "super secret"
class TestETL(unittest.TestCase):
def setUp(self):
self.settings = startup.read_settings(filename="test_settings.json")
self.settings = startup.read_settings(filename="./tests/resources/config/test_settings.json")
constants.set(self.settings.constants)
Log.start(self.settings.debug)
def tearDown(self):
#CLOSE THE CACHED DB CONNECTIONS
#CLOSE THE CACHED MySQL CONNECTIONS
bz_etl.close_db_connections()
if all_db:
@ -60,13 +59,13 @@ class TestETL(unittest.TestCase):
# settings.param.allow_private_bugs = True
database.make_test_instance(self.settings.bugzilla)
with DB(self.settings.bugzilla) as db:
with MySQL(self.settings.bugzilla) as db:
candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
reference = elasticsearch.open_test_instance("reference", self.settings.private_bugs_reference)
#SETUP RUN PARAMETERS
param = Struct()
param.end_time = CNV.datetime2milli(get_current_time(db))
param = Dict()
param.end_time = convert.datetime2milli(get_current_time(db))
param.start_time = 0
param.start_time_str = extract_bugzilla.milli2string(db, 0)
@ -74,7 +73,7 @@ class TestETL(unittest.TestCase):
param.bug_list = self.settings.param.bugs
param.allow_private_bugs = self.settings.param.allow_private_bugs
with ThreadedQueue(candidate, size=1000) as output:
with ThreadedQueue("etl_queue", candidate, max_size=1000) as output:
etl(db, output, param, please_stop=None)
#COMPARE ALL BUGS
@ -91,18 +90,18 @@ class TestETL(unittest.TestCase):
NUM_TO_TEST = 100
MAX_BUG_ID = 900000
with DB(self.settings.bugzilla) as db:
with MySQL(self.settings.bugzilla) as db:
candidate = elasticsearch.make_test_instance("candidate", self.settings.candidate)
reference = ElasticSearch(self.settings.private_bugs_reference)
reference = elasticsearch.Index(self.settings.private_bugs_reference)
#GO FASTER BY STORING LOCAL FILE
local_cache = File(self.settings.param.temp_dir + "/private_bugs.json")
if local_cache.exists:
private_bugs = set(CNV.JSON2object(local_cache.read()))
private_bugs = set(convert.json2value(local_cache.read()))
else:
with Timer("get private bugs"):
private_bugs = compare_es.get_private_bugs(reference)
local_cache.write(CNV.object2JSON(private_bugs))
local_cache.write(convert.value2json(private_bugs))
while True:
some_bugs = [b for b in [Random.int(MAX_BUG_ID) for i in range(NUM_TO_TEST)] if b not in private_bugs]
@ -110,8 +109,8 @@ class TestETL(unittest.TestCase):
Log.note("Test with the following bug_ids: {{bugs}}", {"bugs":some_bugs})
#SETUP RUN PARAMETERS
param = Struct()
param.end_time = CNV.datetime2milli(get_current_time(db))
param = Dict()
param.end_time = convert.datetime2milli(get_current_time(db))
param.start_time = 0
param.start_time_str = extract_bugzilla.milli2string(db, 0)
param.alias_file = self.settings.param.alias_file
@ -196,7 +195,7 @@ class TestETL(unittest.TestCase):
database.make_test_instance(self.settings.bugzilla)
#MARK SOME BUGS PRIVATE
with DB(self.settings.bugzilla) as db:
with MySQL(self.settings.bugzilla) as db:
for b in private_bugs:
database.add_bug_group(db, b, BUG_GROUP_FOR_TESTING)
@ -219,7 +218,7 @@ class TestETL(unittest.TestCase):
bz_etl.main(self.settings, es, es_c)
#MARK SOME STUFF PRIVATE
with DB(self.settings.bugzilla) as db:
with MySQL(self.settings.bugzilla) as db:
#BUGS
private_bugs = set(Random.sample(self.settings.param.bugs, 3))
Log.note("The private bugs are {{bugs}}", {"bugs": private_bugs})
@ -259,7 +258,7 @@ class TestETL(unittest.TestCase):
#MARK SOME STUFF PUBLIC
with DB(self.settings.bugzilla) as db:
with MySQL(self.settings.bugzilla) as db:
for b in private_bugs:
database.remove_bug_group(db, b, BUG_GROUP_FOR_TESTING)
@ -276,7 +275,7 @@ class TestETL(unittest.TestCase):
database.make_test_instance(self.settings.bugzilla)
#MARK SOME STUFF PRIVATE
with DB(self.settings.bugzilla) as db:
with MySQL(self.settings.bugzilla) as db:
private_attachments = db.query("""
SELECT
bug_id,
@ -305,7 +304,7 @@ class TestETL(unittest.TestCase):
database.make_test_instance(self.settings.bugzilla)
#MARK SOME COMMENTS PRIVATE
with DB(self.settings.bugzilla) as db:
with MySQL(self.settings.bugzilla) as db:
private_comments = db.query("""
SELECT
bug_id,
@ -341,7 +340,7 @@ class TestETL(unittest.TestCase):
database.make_test_instance(self.settings.bugzilla)
#MARK SOME BUGS PRIVATE
with DB(self.settings.bugzilla) as db:
with MySQL(self.settings.bugzilla) as db:
for b in private_bugs:
database.add_bug_group(db, b, BUG_GROUP_FOR_TESTING)
@ -350,7 +349,7 @@ class TestETL(unittest.TestCase):
bz_etl.main(self.settings, es, es_c)
# MAKE A CHANGE TO THE PRIVATE BUGS
with DB(self.settings.bugzilla) as db:
with MySQL(self.settings.bugzilla) as db:
for b in private_bugs:
old_bug = db.query("SELECT * FROM bugs WHERE bug_id={{bug_id}}", {"bug_id": b})[0]
new_bug = old_bug.copy()
@ -370,15 +369,15 @@ class TestETL(unittest.TestCase):
"query": {"match_all": {}},
"filter": {"and": [
{"terms": {"bug_id": private_bugs}},
{"range": {"expires_on": {"gte": CNV.datetime2milli(now)}}}
{"range": {"expires_on": {"gte": convert.datetime2milli(now)}}}
]}
}},
"from": 0,
"size": 200000,
"sort": []
})
latest_bugs = Q.select(results.hits.hits, "_source")
latest_bugs_index = Q.unique_index(latest_bugs, "bug_id") # IF NOT UNIQUE, THEN ETL IS WRONG
latest_bugs = qb.select(results.hits.hits, "_source")
latest_bugs_index = qb.unique_index(latest_bugs, "bug_id") # IF NOT UNIQUE, THEN ETL IS WRONG
for bug_id in private_bugs:
if latest_bugs_index[bug_id] == None:
@ -396,18 +395,18 @@ class TestETL(unittest.TestCase):
def test_incremental_etl_catches_tracking_flags(self):
database.make_test_instance(self.settings.bugzilla)
with DB(self.settings.bugzilla) as db:
with MySQL(self.settings.bugzilla) as db:
es = elasticsearch.make_test_instance("candidate", self.settings.candidate)
#SETUP RUN PARAMETERS
param = Struct()
param.end_time = CNV.datetime2milli(get_current_time(db))
param = Dict()
param.end_time = convert.datetime2milli(get_current_time(db))
# FLAGS ADDED TO BUG 813650 ON 18/12/2012 2:38:08 AM (PDT), SO START AT SOME LATER TIME
param.start_time = CNV.datetime2milli(CNV.string2datetime("02/01/2013 10:09:15", "%d/%m/%Y %H:%M:%S"))
param.start_time = convert.datetime2milli(convert.string2datetime("02/01/2013 10:09:15", "%d/%m/%Y %H:%M:%S"))
param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
param.alias_file = self.settings.param.alias_file
param.bug_list = struct.wrap([813650])
param.bug_list = wrap([813650])
param.allow_private_bugs = self.settings.param.allow_private_bugs
with ThreadedQueue(es, size=1000) as output:
@ -428,7 +427,7 @@ class TestETL(unittest.TestCase):
database.make_test_instance(self.settings.bugzilla)
with DB(self.settings.bugzilla) as db:
with MySQL(self.settings.bugzilla) as db:
es = elasticsearch.make_test_instance("candidate", self.settings.candidate)
#MARK BUG AS ONE OF THE SCREENED GROUPS
@ -436,13 +435,13 @@ class TestETL(unittest.TestCase):
db.flush()
#SETUP RUN PARAMETERS
param = Struct()
param.end_time = CNV.datetime2milli(get_current_time(db))
param = Dict()
param.end_time = convert.datetime2milli(get_current_time(db))
param.start_time = 0
param.start_time_str = extract_bugzilla.milli2string(db, 0)
param.alias_file = self.settings.param.alias_file
param.bug_list = struct.wrap([GOOD_BUG_TO_TEST]) # bug 1046 sees lots of whiteboard, and other field, changes
param.bug_list = wrap([GOOD_BUG_TO_TEST]) # bug 1046 sees lots of whiteboard, and other field, changes
param.allow_private_bugs = True
with ThreadedQueue(es, size=1000) as output:
@ -460,7 +459,7 @@ class TestETL(unittest.TestCase):
database.make_test_instance(self.settings.bugzilla)
with DB(self.settings.bugzilla) as db:
with MySQL(self.settings.bugzilla) as db:
es = elasticsearch.make_test_instance("candidate", self.settings.candidate)
#MARK BUG AS ONE OF THE SCREENED GROUPS
@ -470,13 +469,13 @@ class TestETL(unittest.TestCase):
db.flush()
#SETUP RUN PARAMETERS
param = Struct()
param.end_time = CNV.datetime2milli(get_current_time(db))
param = Dict()
param.end_time = convert.datetime2milli(get_current_time(db))
param.start_time = 0
param.start_time_str = extract_bugzilla.milli2string(db, 0)
param.alias_file = self.settings.param.alias_file
param.bug_list = struct.wrap([GOOD_BUG_TO_TEST]) # bug 1046 sees lots of whiteboard, and other field, changes
param.bug_list = wrap([GOOD_BUG_TO_TEST]) # bug 1046 sees lots of whiteboard, and other field, changes
param.allow_private_bugs = True
with ThreadedQueue(es, size=1000) as output:
@ -491,13 +490,13 @@ class TestETL(unittest.TestCase):
def test_incremental_has_correct_expires_on(self):
# 813650, 726635 BOTH HAVE CHANGES IN 2013
bugs = struct.wrap([813650, 726635])
start_incremental=CNV.datetime2milli(CNV.string2datetime("2013-01-01", "%Y-%m-%d"))
bugs = wrap([813650, 726635])
start_incremental=convert.datetime2milli(convert.string2datetime("2013-01-01", "%Y-%m-%d"))
es = elasticsearch.make_test_instance("candidate", self.settings.candidate)
with DB(self.settings.bugzilla) as db:
with MySQL(self.settings.bugzilla) as db:
#SETUP FIRST RUN PARAMETERS
param = Struct()
param = Dict()
param.end_time = start_incremental
param.start_time = 0
param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
@ -510,8 +509,8 @@ class TestETL(unittest.TestCase):
etl(db, output, param, please_stop=None)
#SETUP INCREMENTAL RUN PARAMETERS
param = Struct()
param.end_time = CNV.datetime2milli(datetime.utcnow())
param = Dict()
param.end_time = convert.datetime2milli(datetime.utcnow())
param.start_time = start_incremental
param.start_time_str = extract_bugzilla.milli2string(db, param.start_time)
@ -528,7 +527,7 @@ class TestETL(unittest.TestCase):
"query": {"match_all": {}},
"filter": {"and":[
{"term":{"bug_id":b}},
{"range":{"expires_on":{"gte":CNV.datetime2milli(datetime.utcnow())}}}
{"range":{"expires_on":{"gte":convert.datetime2milli(datetime.utcnow())}}}
]}
}},
"from": 0,
@ -564,13 +563,13 @@ def verify_public_bugs(es, private_bugs):
def verify_no_private_attachments(es, private_attachments):
#VERIFY ATTACHMENTS ARE NOT IN OUTPUT
for b in Q.select(private_attachments, "bug_id"):
for b in qb.select(private_attachments, "bug_id"):
versions = compare_es.get_all_bug_versions(es, b)
#WE ASSUME THE ATTACHMENT, IF IT EXISTS, WILL BE SOMEWHERE IN THE BUG IT
#BELONGS TO, IF AT ALL
for v in versions:
for a in v.attachments:
if a.attach_id in Q.select(private_attachments, "attach_id"):
if a.attach_id in qb.select(private_attachments, "attach_id"):
Log.error("Private attachment should not exist")
@ -587,7 +586,7 @@ def verify_no_private_comments(es, private_comments):
"sort": []
})
if Q.select(data.hits.hits, "_source"):
if qb.select(data.hits.hits, "_source"):
Log.error("Expecting no comments")
@ -601,25 +600,25 @@ def compare_both(candidate, reference, settings, some_bugs):
found_errors = False
for bug_id in some_bugs:
try:
versions = Q.sort(
versions = qb.sort(
get_all_bug_versions(candidate, bug_id, datetime.utcnow()),
"modified_ts")
# WE CAN NOT EXPECT candidate TO BE UP TO DATE BECAUSE IT IS USING AN OLD IMAGE
if not versions:
max_time = CNV.milli2datetime(settings.bugzilla.expires_on)
max_time = convert.milli2datetime(settings.bugzilla.expires_on)
else:
max_time = CNV.milli2datetime(versions.last().modified_ts)
max_time = convert.milli2datetime(versions.last().modified_ts)
pre_ref_versions = get_all_bug_versions(reference, bug_id, max_time)
ref_versions = \
Q.sort(
qb.sort(
#ADDED TO FIX OLD PRODUCTION BUG VERSIONS
[compare_es.old2new(x, settings.bugzilla.expires_on) for x in pre_ref_versions],
"modified_ts"
)
can = CNV.object2JSON(versions, pretty=True)
ref = CNV.object2JSON(ref_versions, pretty=True)
can = convert.value2json(versions, pretty=True)
ref = convert.value2json(ref_versions, pretty=True)
if can != ref:
found_errors = True
File(try_dir + unicode(bug_id) + ".txt").write(can)

View file

@ -8,14 +8,14 @@
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
import unittest
from bzETL import extract_bugzilla, bz_etl
from bzETL import bz_etl, extract_bugzilla
from bzETL.bz_etl import etl
from bzETL.extract_bugzilla import get_current_time
from pyLibrary.cnv import CNV
from pyLibrary.sql.db import DB, all_db
from pyLibrary.env.logs import Log
from pyLibrary.env import startup
from pyLibrary.struct import Struct
from pyLibrary import convert
from pyLibrary.debugs import startup
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import Dict
from pyLibrary.sql.mysql import all_db, MySQL
from pyLibrary.testing import elasticsearch
from pyLibrary.thread.threads import ThreadedQueue
@ -31,7 +31,7 @@ class TestOneETL(unittest.TestCase):
def tearDown(self):
#CLOSE THE CACHED DB CONNECTIONS
#CLOSE THE CACHED MySQL CONNECTIONS
bz_etl.close_db_connections()
if all_db:
@ -45,12 +45,12 @@ class TestOneETL(unittest.TestCase):
USE A MYSQL DATABASE TO FILL AN ES INSTANCE (USE Fake_ES() INSTANCES TO KEEP
THIS TEST LOCAL) WITH VERSIONS OF BUGS FROM settings.param.bugs.
"""
with DB(self.settings.bugzilla) as db:
with MySQL(self.settings.bugzilla) as db:
candidate = elasticsearch.make_test_instance("candidate", self.settings.elasticsearch)
#SETUP RUN PARAMETERS
param = Struct()
param.end_time = CNV.datetime2milli(get_current_time(db))
param = Dict()
param.end_time = convert.datetime2milli(get_current_time(db))
param.start_time = 0
param.start_time_str = extract_bugzilla.milli2string(db, 0)
@ -63,7 +63,7 @@ class TestOneETL(unittest.TestCase):
#TODO: INCLUDE OPTION TO USE REAL ES (AND ENSURE REALLY WORKING)
# es_settings=Struct(**{
# es_settings=Dict(**{
# "host": "http://localhost",
# "port": "9200",
# "index": ElasticSearch.proto_name("test_public_bugs"),

View file

@ -9,10 +9,10 @@
#
from bzETL import replicate
from pyLibrary.env import startup
from pyLibrary.cnv import CNV
from pyLibrary.env.elasticsearch import ElasticSearch
from pyLibrary.env.logs import Log
from pyLibrary import convert
from pyLibrary.debugs import startup
from pyLibrary.debugs.logs import Log
from pyLibrary.env import elasticsearch
def test_replication():
@ -20,10 +20,10 @@ def test_replication():
settings=startup.read_settings(filename="replication_settings.json")
Log.start(settings.debug)
source=ElasticSearch(settings.source)
source=elasticsearch.Index(settings.source)
destination=replicate.get_or_create_index(settings["destination"], source)
replicate.replicate(source, destination, [537285], CNV.string2datetime("19900101", "%Y%m%d"))
replicate.replicate(source, destination, [537285], convert.string2datetime("19900101", "%Y%m%d"))
finally:
Log.stop()

View file

@ -10,11 +10,10 @@
from datetime import datetime
from bzETL import transform_bugzilla, parse_bug_history
from pyLibrary import struct
from pyLibrary.struct import nvl
from pyLibrary.cnv import CNV
from pyLibrary import convert
from pyLibrary.dot import coalesce, unwrap
from pyLibrary.maths import Math
from pyLibrary.queries import Q
from pyLibrary.queries import qb
#PULL ALL BUG DOCS FROM ONE ES
@ -22,14 +21,14 @@ from pyLibrary.times.timer import Timer
def get_all_bug_versions(es, bug_id, max_time=None):
max_time = nvl(max_time, datetime.max)
max_time = coalesce(max_time, datetime.max)
data = es.search({
"query": {"filtered": {
"query": {"match_all": {}},
"filter": {"and": [
{"term": {"bug_id": bug_id}},
{"range": {"modified_ts": {"lte": CNV.datetime2milli(max_time)}}}
{"range": {"modified_ts": {"lte": convert.datetime2milli(max_time)}}}
]}
}},
"from": 0,
@ -37,7 +36,7 @@ def get_all_bug_versions(es, bug_id, max_time=None):
"sort": []
})
return Q.select(data.hits.hits, "_source")
return qb.select(data.hits.hits, "_source")
def get_private_bugs(es):
@ -63,10 +62,10 @@ def get_private_bugs(es):
output = set([])
for bug in data.hits.hits:
output.add(bug.fields.bug_id)
output |= set(nvl(CNV.value2intlist(bug.fields.blocked), []))
output |= set(nvl(CNV.value2intlist(bug.fields.dependson), []))
output |= set(nvl(CNV.value2intlist(bug.fields.dupe_of), []))
output |= set(nvl(CNV.value2intlist(bug.fields.dupe_by), []))
output |= set(coalesce(convert.value2intlist(bug.fields.blocked), []))
output |= set(coalesce(convert.value2intlist(bug.fields.dependson), []))
output |= set(coalesce(convert.value2intlist(bug.fields.dupe_of), []))
output |= set(coalesce(convert.value2intlist(bug.fields.dupe_by), []))
output.update({551988, 636964})
return output
@ -83,23 +82,23 @@ def old2new(bug, max_date):
else:
bug.everconfirmed = int(bug.everconfirmed)
bug = CNV.JSON2object(CNV.object2JSON(bug).replace("bugzilla: other b.m.o issues ", "bugzilla: other b.m.o issues"))
bug = convert.json2value(convert.value2json(bug).replace("bugzilla: other b.m.o issues ", "bugzilla: other b.m.o issues"))
if bug.expires_on > max_date:
bug.expires_on = parse_bug_history.MAX_TIME
if bug.votes != None:
bug.votes = int(bug.votes)
bug.dupe_by = CNV.value2intlist(bug.dupe_by)
bug.dupe_by = convert.value2intlist(bug.dupe_by)
if bug.votes == 0:
del bug["votes"]
# if Math.is_integer(bug.remaining_time) and int(bug.remaining_time) == 0:
# bug.remaining_time = 0
if bug.cf_due_date != None and not Math.is_number(bug.cf_due_date):
bug.cf_due_date = CNV.datetime2milli(
CNV.string2datetime(bug.cf_due_date, "%Y-%m-%d")
bug.cf_due_date = convert.datetime2milli(
convert.string2datetime(bug.cf_due_date, "%Y-%m-%d")
)
bug.changes = CNV.JSON2object(
CNV.object2JSON(Q.sort(bug.changes, "field_name")) \
bug.changes = convert.json2value(
convert.value2json(qb.sort(bug.changes, "field_name")) \
.replace("\"field_value_removed\":", "\"old_value\":") \
.replace("\"field_value\":", "\"new_value\":")
)
@ -113,7 +112,7 @@ def old2new(bug, max_date):
if Math.is_number(bug.cf_last_resolved):
bug.cf_last_resolved = long(bug.cf_last_resolved)
else:
bug.cf_last_resolved = CNV.datetime2milli(CNV.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S"))
bug.cf_last_resolved = convert.datetime2milli(convert.string2datetime(bug.cf_last_resolved, "%Y-%m-%d %H:%M:%S"))
except Exception, e:
pass
@ -123,15 +122,15 @@ def old2new(bug, max_date):
if c.attach_id == '':
c.attach_id = None
else:
c.attach_id = CNV.value2int(c.attach_id)
c.attach_id = convert.value2int(c.attach_id)
bug.attachments = Q.sort(bug.attachments, "attach_id")
bug.attachments = qb.sort(bug.attachments, "attach_id")
for a in bug.attachments:
a.attach_id = CNV.value2int(a.attach_id)
a.attach_id = convert.value2int(a.attach_id)
for k, v in list(a.items()):
if k.endswith("isobsolete") or k.endswith("ispatch") or k.endswith("isprivate"):
struct.unwrap(a)[k] = CNV.value2int(v) # PREVENT dot (.) INTERPRETATION
a[k.split(".")[-1].split("_")[-1]] = CNV.value2int(v)
unwrap(a)[k] = convert.value2int(v) # PREVENT dot (.) INTERPRETATION
a[k.split(".")[-1].split("_")[-1]] = convert.value2int(v)
bug = transform_bugzilla.normalize(bug)
return bug

View file

@ -1,11 +1,11 @@
# encoding: utf-8
#
from bzETL.extract_bugzilla import milli2string, get_current_time
from pyLibrary.cnv import CNV
from pyLibrary.queries.db_query import esfilter2sqlwhere
from pyLibrary.sql.db import DB
from pyLibrary.env.logs import Log
from pyLibrary.struct import Struct
from pyLibrary import convert
from pyLibrary.debugs.logs import Log
from pyLibrary.dot import Dict
from pyLibrary.queries.qb_usingMySQL import esfilter2sqlwhere
from pyLibrary.sql.mysql import MySQL
from pyLibrary.times.timer import Timer
@ -20,13 +20,13 @@ def make_test_instance(db_settings):
Log.note("Make empty {{schema}} schema", {"schema":db_settings.schema})
no_schema=db_settings.copy()
no_schema.schema = None
with DB(no_schema) as db:
with MySQL(no_schema) as db:
db.execute("DROP DATABASE IF EXISTS {{schema}}", {"schema":db.quote_column(db_settings.schema)})
db.execute("CREATE DATABASE {{schema}}", {"schema":db.quote_column(db_settings.schema)})
#FILL SCHEMA
Log.note("Fill {{schema}} schema with data", {"schema":db_settings.schema})
DB.execute_file(db_settings, db_settings.filename)
MySQL.execute_file(filename=db_settings.filename, settings=db_settings)
except Exception, e:
Log.error("Can not setup test database", e)
@ -63,8 +63,8 @@ def add_bug_group(db, bug_id, group_name):
group_id=group_exists[0].id
diff(db, "bugs",
Struct(bug_id=bug_id, bug_group=None),
Struct(bug_id=bug_id, bug_group=group_name)
Dict(bug_id=bug_id, bug_group=None),
Dict(bug_id=bug_id, bug_group=group_name)
)
db.insert("bug_group_map", {"bug_id":bug_id, "group_id":group_id})
@ -73,8 +73,8 @@ def remove_bug_group(db, bug_id, group_name):
group_id=db.query("SELECT id FROM groups WHERE name={{name}}", {"name": group_name})[0].id
diff(db, "bugs",
Struct(bug_id=bug_id, bug_group=group_name),
Struct(bug_id=bug_id, bug_group=None)
Dict(bug_id=bug_id, bug_group=group_name),
Dict(bug_id=bug_id, bug_group=None)
)
db.execute("DELETE FROM bug_group_map WHERE bug_id={{bug_id}} and group_id={{group_id}}", {
"bug_id":bug_id,
@ -88,7 +88,7 @@ def diff(db, table, old_record, new_record):
"""
UPDATE bugs_activity WITH THE CHANGES IN RECORDS
"""
now = milli2string(db, CNV.datetime2milli(get_current_time(db)))
now = milli2string(db, convert.datetime2milli(get_current_time(db)))
changed = set(old_record.keys()) ^ set(new_record.keys())
changed |= set([k for k, v in old_record.items() if v != new_record[k]])
@ -103,7 +103,7 @@ def diff(db, table, old_record, new_record):
if fieldid == None:
Log.error("Expecting a valid field name")
activity = Struct(
activity = Dict(
bug_id=old_record.bug_id,
who=1,
bug_when=now,