From dc60b040f517b053d6513299dc179e10e674441b Mon Sep 17 00:00:00 2001
From: Leli <33942105+lelilia@users.noreply.github.com>
Date: Thu, 29 Feb 2024 19:24:57 +0100
Subject: [PATCH] DENG-2918-UDFs for URL parsing (#5141)

* DENG-2918-UDFs for URL parsing

* removing get_host because net.host() exists
---
 sql/mozfun/utils/get_url_path/README.md     |  4 ++++
 sql/mozfun/utils/get_url_path/metadata.yaml |  2 ++
 sql/mozfun/utils/get_url_path/udf.sql       | 21 +++++++++++++++++++++
 3 files changed, 27 insertions(+)
 create mode 100644 sql/mozfun/utils/get_url_path/README.md
 create mode 100644 sql/mozfun/utils/get_url_path/metadata.yaml
 create mode 100644 sql/mozfun/utils/get_url_path/udf.sql

diff --git a/sql/mozfun/utils/get_url_path/README.md b/sql/mozfun/utils/get_url_path/README.md
new file mode 100644
index 0000000000..e4d300a80a
--- /dev/null
+++ b/sql/mozfun/utils/get_url_path/README.md
@@ -0,0 +1,4 @@
+This UDF extracts the path from a URL string.
+
+The path is everything after the host and before parameters.
+This function returns "/" if there is no path.
\ No newline at end of file
diff --git a/sql/mozfun/utils/get_url_path/metadata.yaml b/sql/mozfun/utils/get_url_path/metadata.yaml
new file mode 100644
index 0000000000..ad56bad597
--- /dev/null
+++ b/sql/mozfun/utils/get_url_path/metadata.yaml
@@ -0,0 +1,2 @@
+description: Extract the Path from a URL
+friendly_name: Get URL Path
diff --git a/sql/mozfun/utils/get_url_path/udf.sql b/sql/mozfun/utils/get_url_path/udf.sql
new file mode 100644
index 0000000000..86a347155b
--- /dev/null
+++ b/sql/mozfun/utils/get_url_path/udf.sql
@@ -0,0 +1,41 @@
+-- Extract the path component of a URL: everything after the host and before the
+-- first "?", "#" or "&". Returns "/" when the URL has no path or is NULL.
+CREATE OR REPLACE FUNCTION utils.get_url_path(url STRING)
+RETURNS STRING AS (
+  COALESCE(
+    REGEXP_EXTRACT(
+      -- Strip only a leading http(s) scheme, in any letter case. A global REPLACE
+      -- would also delete "http://" where it occurs inside the path itself.
+      REGEXP_REPLACE(url, r"(?i)^https?://", ""),
+      -- Skip the host, which ends at the first "/", "?" or "#", so a "/" inside
+      -- the query string or fragment is never mistaken for the start of the path.
+      r"^[^/?#]*(/[^&?#]*)"
+    ),
+    "/"
+  )
+);
+
+-- Tests
+SELECT
+  mozfun.assert.equals("/path", utils.get_url_path("https://some-url.com/path")),
+  mozfun.assert.equals("/path", utils.get_url_path("http://some-url.com/path?more")),
+  mozfun.assert.equals("/path", utils.get_url_path("http://some-url.com/path#more")),
+  mozfun.assert.equals("/path", utils.get_url_path("http://some-url.com/path?more&utm=123")),
+  mozfun.assert.equals(
+    "/path/with/multiple/slashes",
+    utils.get_url_path("http://some-url.com/path/with/multiple/slashes?more")
+  ),
+  mozfun.assert.equals("/", utils.get_url_path("https://some-url.com")),
+  mozfun.assert.equals("/", utils.get_url_path("https://some-url.com/")),
+  mozfun.assert.equals("/path", utils.get_url_path("some-url.com/path")),
+  -- A "/" that first appears in the query string or fragment is not a path.
+  mozfun.assert.equals("/", utils.get_url_path("https://some-url.com?redirect=/other")),
+  mozfun.assert.equals("/", utils.get_url_path("https://some-url.com#/section")),
+  -- Only the leading scheme is removed, whatever its letter case.
+  mozfun.assert.equals("/path", utils.get_url_path("HTTPS://some-url.com/path")),
+  mozfun.assert.equals(
+    "/go/https://other.com/x",
+    utils.get_url_path("http://some-url.com/go/https://other.com/x")
+  ),
+  -- NULL input falls back to the root path.
+  mozfun.assert.equals("/", utils.get_url_path(CAST(NULL AS STRING)))