From 2923e77408055566b590629c468110a4671aeab0 Mon Sep 17 00:00:00 2001 From: Apoorve Dave <66283785+apoorvedave1@users.noreply.github.com> Date: Fri, 26 Mar 2021 17:39:42 -0700 Subject: [PATCH] add distinct on file ids Co-authored-by: EJ Song <51077614+sezruby@users.noreply.github.com> --- .../com/microsoft/hyperspace/index/rules/PEFilterIndexRule.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/scala/com/microsoft/hyperspace/index/rules/PEFilterIndexRule.scala b/src/main/scala/com/microsoft/hyperspace/index/rules/PEFilterIndexRule.scala index 5d47ce22..5186a5f3 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/rules/PEFilterIndexRule.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/rules/PEFilterIndexRule.scala @@ -100,6 +100,7 @@ object PEFilterIndexRule .parquet(index.content.files.map(_.toString): _*) .where(condition.sql) .select(IndexConstants.DATA_FILE_NAME_ID) + .distinct val fileIds = filteredDf.rdd.map(r => r(0)).collect.toSet index.fileIdTracker.getFileToIdMap