Data Skipping Index Part 3-2: Rule (#482)
This commit is contained in:
Родитель
b06cbddb18
Коммит
9735b57be5
|
@ -127,6 +127,9 @@ ThisBuild / Test / fork := true
|
|||
|
||||
ThisBuild / Test / javaOptions += "-Xmx1024m"
|
||||
|
||||
// Needed to test both non-codegen and codegen parts of expressions
|
||||
ThisBuild / Test / envVars += "SPARK_TESTING" -> "1"
|
||||
|
||||
ThisBuild / coverageExcludedPackages := "com\\.fasterxml.*;com\\.microsoft\\.hyperspace\\.shim"
|
||||
|
||||
/**
|
||||
|
|
|
@ -14,15 +14,11 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.util
|
||||
package com.microsoft.hyperspace.shim
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions.{Expression, ExprId}
|
||||
import org.apache.spark.sql.catalyst.expressions.{Expression, Literal}
|
||||
import org.apache.spark.sql.catalyst.expressions.aggregate.First
|
||||
|
||||
trait ExprMatcher {
|
||||
|
||||
/**
|
||||
* Returns true if the given expression matches the expression this matcher
|
||||
* is associated with.
|
||||
*/
|
||||
def apply(e: Expression): Boolean
|
||||
object FirstNullSafe {
|
||||
def apply(child: Expression): First = First(child, Literal(false))
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.shim
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||
import org.apache.spark.sql.catalyst.expressions.aggregate.First
|
||||
|
||||
object FirstNullSafe {
|
||||
def apply(child: Expression): First = First(child, false)
|
||||
}
|
|
@ -21,7 +21,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession}
|
|||
import com.microsoft.hyperspace.index._
|
||||
import com.microsoft.hyperspace.index.IndexConstants.{OPTIMIZE_MODE_QUICK, REFRESH_MODE_FULL}
|
||||
import com.microsoft.hyperspace.index.plananalysis.{CandidateIndexAnalyzer, PlanAnalyzer}
|
||||
import com.microsoft.hyperspace.index.rules.ApplyHyperspace
|
||||
import com.microsoft.hyperspace.index.rules.ApplyHyperspace.withHyperspaceRuleDisabled
|
||||
import com.microsoft.hyperspace.index.sources.FileBasedSourceProviderManager
|
||||
|
||||
class Hyperspace(spark: SparkSession) {
|
||||
|
@ -189,15 +189,6 @@ class Hyperspace(spark: SparkSession) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
private def withHyperspaceRuleDisabled(f: => Unit): Unit = {
|
||||
try {
|
||||
ApplyHyperspace.disableForIndexMaintenance.set(true)
|
||||
f
|
||||
} finally {
|
||||
ApplyHyperspace.disableForIndexMaintenance.set(false)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
object Hyperspace extends ActiveSparkSession {
|
||||
|
|
|
@ -128,4 +128,22 @@ object IndexConstants {
|
|||
val DATASKIPPING_TARGET_INDEX_DATA_FILE_SIZE =
|
||||
"spark.hyperspace.index.dataskipping.targetIndexDataFileSize"
|
||||
val DATASKIPPING_TARGET_INDEX_DATA_FILE_SIZE_DEFAULT = "268435456" // 256 MiB
|
||||
|
||||
/**
|
||||
* Maximum number of index data files.
|
||||
*
|
||||
* The number of index data files determined by targetIndexFileSize is
|
||||
* capped by this value.
|
||||
*/
|
||||
val DATASKIPPING_MAX_INDEX_DATA_FILE_COUNT =
|
||||
"spark.hyperspace.index.dataskipping.maxIndexDataFileCount"
|
||||
val DATASKIPPING_MAX_INDEX_DATA_FILE_COUNT_DEFAULT = "10000"
|
||||
|
||||
/**
|
||||
* If set to true, partition sketches for partition columns are included when
|
||||
* creating data skipping indexes. This does not affect existing indexes.
|
||||
*/
|
||||
val DATASKIPPING_AUTO_PARTITION_SKETCH =
|
||||
"spark.hyperspace.index.dataskipping.autoPartitionSketch"
|
||||
val DATASKIPPING_AUTO_PARTITION_SKETCH_DEFAULT = "true"
|
||||
}
|
||||
|
|
|
@ -16,7 +16,8 @@
|
|||
|
||||
package com.microsoft.hyperspace.index
|
||||
|
||||
import org.apache.spark.sql.execution.datasources.InMemoryFileIndex
|
||||
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||
import org.apache.spark.sql.execution.datasources.{FileIndex, InMemoryFileIndex}
|
||||
|
||||
import com.microsoft.hyperspace.index.plananalysis.FilterReason
|
||||
|
||||
|
@ -68,4 +69,17 @@ object IndexLogEntryTags {
|
|||
// If it's enabled, FILTER_REASONS and APPLIED_INDEX_RULES info will be tagged.
|
||||
val INDEX_PLAN_ANALYSIS_ENABLED: IndexLogEntryTag[Boolean] =
|
||||
IndexLogEntryTag[Boolean]("indexPlanAnalysisEnabled")
|
||||
|
||||
// DATASKIPPING_INDEX_DATA_PREDICATE stores the index predicate translated
|
||||
// from the plan's filter or join condition.
|
||||
val DATASKIPPING_INDEX_PREDICATE: IndexLogEntryTag[Option[Expression]] =
|
||||
IndexLogEntryTag[Option[Expression]]("dataskippingIndexPredicate")
|
||||
|
||||
// DATASKIPPING_INDEX_FILEINDEX stores InMemoryFileIndex for the index data.
|
||||
val DATASKIPPING_INDEX_FILEINDEX: IndexLogEntryTag[InMemoryFileIndex] =
|
||||
IndexLogEntryTag[InMemoryFileIndex]("dataskippingIndexRelation")
|
||||
|
||||
// DATASKIPPING_INDEX_FILEINDEX stores InMemoryFileIndex for the source data.
|
||||
val DATASKIPPING_SOURCE_FILEINDEX: IndexLogEntryTag[FileIndex] =
|
||||
IndexLogEntryTag[FileIndex]("dataskippingSourceRelation")
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@ package com.microsoft.hyperspace.index
|
|||
|
||||
import java.net.URLDecoder
|
||||
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.spark.sql.DataFrame
|
||||
import org.apache.spark.sql.functions.udf
|
||||
|
||||
|
@ -64,4 +65,12 @@ object IndexUtils {
|
|||
*/
|
||||
lazy val decodeInputFileName = udf(
|
||||
(p: String) => URLDecoder.decode(p.replace("+", "%2B"), "UTF-8"))
|
||||
|
||||
/**
|
||||
* Returns the path part of the URI-like string.
|
||||
*
|
||||
* This can be used to compare the results of input_file_name() and the paths
|
||||
* stored in FileIdTracker.
|
||||
*/
|
||||
lazy val getPath = udf((p: String) => new Path(p).toUri.getPath)
|
||||
}
|
||||
|
|
|
@ -16,13 +16,20 @@
|
|||
|
||||
package com.microsoft.hyperspace.index.dataskipping
|
||||
|
||||
import org.apache.spark.sql.{Column, DataFrame, SaveMode}
|
||||
import org.apache.spark.sql.functions.{input_file_name, min, spark_partition_id}
|
||||
import scala.collection.mutable
|
||||
|
||||
import org.apache.spark.sql.{Column, DataFrame, SaveMode, SparkSession}
|
||||
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
|
||||
import org.apache.spark.sql.catalyst.expressions._
|
||||
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
|
||||
import org.apache.spark.sql.functions.input_file_name
|
||||
import org.apache.spark.sql.hyperspace.utils.StructTypeUtils
|
||||
import org.apache.spark.sql.types.StructType
|
||||
|
||||
import com.microsoft.hyperspace.HyperspaceException
|
||||
import com.microsoft.hyperspace.index._
|
||||
import com.microsoft.hyperspace.index.dataskipping.sketch.Sketch
|
||||
import com.microsoft.hyperspace.index.dataskipping.util.{DataFrameUtils, ExpressionUtils}
|
||||
import com.microsoft.hyperspace.index.dataskipping.expressions._
|
||||
import com.microsoft.hyperspace.index.dataskipping.sketches.{PartitionSketch, Sketch}
|
||||
import com.microsoft.hyperspace.index.dataskipping.util.DataFrameUtils
|
||||
import com.microsoft.hyperspace.util.HyperspaceConf
|
||||
|
||||
/**
|
||||
|
@ -30,10 +37,12 @@ import com.microsoft.hyperspace.util.HyperspaceConf
|
|||
* files in relations using sketches.
|
||||
*
|
||||
* @param sketches List of sketches for this index
|
||||
* @param schema Index data schema
|
||||
* @param properties Properties for this index; see [[Index.properties]] for details.
|
||||
*/
|
||||
case class DataSkippingIndex(
|
||||
sketches: Seq[Sketch],
|
||||
schema: StructType,
|
||||
override val properties: Map[String, String] = Map.empty)
|
||||
extends Index {
|
||||
assert(sketches.nonEmpty, "At least one sketch is required.")
|
||||
|
@ -61,7 +70,8 @@ case class DataSkippingIndex(
|
|||
}
|
||||
|
||||
override def optimize(ctx: IndexerContext, indexDataFilesToOptimize: Seq[FileInfo]): Unit = {
|
||||
val indexData = ctx.spark.read.parquet(indexDataFilesToOptimize.map(_.name): _*)
|
||||
val indexData =
|
||||
ctx.spark.read.schema(schema).parquet(indexDataFilesToOptimize.map(_.name): _*)
|
||||
writeImpl(ctx, indexData, SaveMode.Overwrite)
|
||||
}
|
||||
|
||||
|
@ -71,7 +81,10 @@ case class DataSkippingIndex(
|
|||
deletedSourceDataFiles: Seq[FileInfo],
|
||||
indexContent: Content): (Index, Index.UpdateMode) = {
|
||||
if (appendedSourceData.nonEmpty) {
|
||||
writeImpl(ctx, index(ctx, appendedSourceData.get), SaveMode.Overwrite)
|
||||
writeImpl(
|
||||
ctx,
|
||||
DataSkippingIndex.createIndexData(ctx, sketches, appendedSourceData.get),
|
||||
SaveMode.Overwrite)
|
||||
}
|
||||
if (deletedSourceDataFiles.nonEmpty) {
|
||||
val spark = ctx.spark
|
||||
|
@ -98,23 +111,188 @@ case class DataSkippingIndex(
|
|||
override def refreshFull(
|
||||
ctx: IndexerContext,
|
||||
sourceData: DataFrame): (DataSkippingIndex, DataFrame) = {
|
||||
val updatedIndex = copy(sketches = ExpressionUtils.resolve(ctx.spark, sketches, sourceData))
|
||||
(updatedIndex, updatedIndex.index(ctx, sourceData))
|
||||
val resolvedSketches = ExpressionUtils.resolve(ctx.spark, sketches, sourceData)
|
||||
val indexData = DataSkippingIndex.createIndexData(ctx, resolvedSketches, sourceData)
|
||||
val updatedIndex = copy(sketches = resolvedSketches, schema = indexData.schema)
|
||||
(updatedIndex, indexData)
|
||||
}
|
||||
|
||||
override def equals(that: Any): Boolean =
|
||||
that match {
|
||||
case DataSkippingIndex(thatSketches, _) => sketches.toSet == thatSketches.toSet
|
||||
case DataSkippingIndex(thatSketches, thatSchema, _) =>
|
||||
sketches.toSet == thatSketches.toSet && schema == thatSchema
|
||||
case _ => false
|
||||
}
|
||||
|
||||
override def hashCode: Int = sketches.map(_.hashCode).sum
|
||||
|
||||
/**
|
||||
* Translate the given filter/join condition for the source data to a
|
||||
* predicate that can be used to filter out unnecessary source data files
|
||||
* when applied to index data.
|
||||
*
|
||||
* For example, a filter condition "A = 1" can be translated into an index
|
||||
* predicate "Min_A <= 1 && Max_A >= 1" to filter out files which cannot
|
||||
* satisfy the condition for any rows in the file.
|
||||
*
|
||||
* It is assumed that the condition is in negation normal form. If it is not,
|
||||
* then it may fail to translate the condition which would have been possible
|
||||
* otherwise. This is a valid assumption for Spark 2.4 and later.
|
||||
*/
|
||||
def translateFilterCondition(
|
||||
spark: SparkSession,
|
||||
condition: Expression,
|
||||
source: LogicalPlan): Option[Expression] = {
|
||||
val resolvedExprs =
|
||||
ExpressionUtils.getResolvedExprs(spark, sketches, source).getOrElse { return None }
|
||||
val predMap = buildPredicateMap(condition, source, resolvedExprs)
|
||||
|
||||
// Create a single index predicate for a single source predicate node,
|
||||
// by combining individual index predicates with And.
|
||||
// True is returned if there are no index predicates for the source predicate node.
|
||||
def toIndexPred(sourcePred: Expression): Expression = {
|
||||
predMap.get(sourcePred).map(_.reduceLeft(And)).getOrElse(Literal.TrueLiteral)
|
||||
}
|
||||
|
||||
// Compose an index predicate visiting the source predicate tree recursively.
|
||||
def composeIndexPred(sourcePred: Expression): Expression =
|
||||
sourcePred match {
|
||||
case and: And => And(toIndexPred(and), and.mapChildren(composeIndexPred))
|
||||
case or: Or => And(toIndexPred(or), or.mapChildren(composeIndexPred))
|
||||
case leaf => toIndexPred(leaf)
|
||||
}
|
||||
|
||||
val indexPredicate = composeIndexPred(condition)
|
||||
|
||||
// Apply constant folding to get the final predicate.
|
||||
// This is a trimmed down version of the BooleanSimplification rule.
|
||||
// It's just enough to determine whether the index is applicable or not.
|
||||
val optimizePredicate: PartialFunction[Expression, Expression] = {
|
||||
case And(Literal.TrueLiteral, right) => right
|
||||
case And(left, Literal.TrueLiteral) => left
|
||||
case Or(Literal.TrueLiteral, _) => Literal.TrueLiteral
|
||||
case Or(_, Literal.TrueLiteral) => Literal.TrueLiteral
|
||||
}
|
||||
val optimizedIndexPredicate = indexPredicate.transformUp(optimizePredicate)
|
||||
|
||||
// Return None if the index predicate is True - meaning no conversion can be done.
|
||||
if (optimizedIndexPredicate == Literal.TrueLiteral) {
|
||||
None
|
||||
} else {
|
||||
Some(optimizedIndexPredicate)
|
||||
}
|
||||
}
|
||||
|
||||
private def writeImpl(ctx: IndexerContext, indexData: DataFrame, writeMode: SaveMode): Unit = {
|
||||
// require instead of assert, as the condition can potentially be broken by
|
||||
// code which is external to dataskipping.
|
||||
require(
|
||||
indexData.schema.sameType(schema),
|
||||
"Schema of the index data doesn't match the index schema: " +
|
||||
s"index data schema = ${indexData.schema.toDDL}, index schema = ${schema.toDDL}")
|
||||
indexData.cache()
|
||||
indexData.count() // force cache
|
||||
val indexDataSize = DataFrameUtils.getSizeInBytes(indexData)
|
||||
val targetIndexDataFileSize = HyperspaceConf.DataSkipping.targetIndexDataFileSize(ctx.spark)
|
||||
val maxIndexDataFileCount = HyperspaceConf.DataSkipping.maxIndexDataFileCount(ctx.spark)
|
||||
val numFiles = {
|
||||
val n = indexDataSize / targetIndexDataFileSize
|
||||
math.min(math.max(1, n), maxIndexDataFileCount).toInt
|
||||
}
|
||||
val repartitionedIndexData = indexData.repartition(numFiles)
|
||||
repartitionedIndexData.write.mode(writeMode).parquet(ctx.indexDataPath.toString)
|
||||
indexData.unpersist()
|
||||
}
|
||||
|
||||
/**
|
||||
* Collects index predicates for each node in the source predicate.
|
||||
*/
|
||||
private def buildPredicateMap(
|
||||
predicate: Expression,
|
||||
source: LogicalPlan,
|
||||
resolvedExprs: Map[Sketch, Seq[Expression]])
|
||||
: scala.collection.Map[Expression, Seq[Expression]] = {
|
||||
val predMap = mutable.Map[Expression, mutable.Buffer[Expression]]()
|
||||
val sketchesWithIndex = sketches.zipWithIndex
|
||||
val nameMap = source.output.map(attr => attr.exprId -> attr.name).toMap
|
||||
val attrMap = buildAttrMap(predicate, resolvedExprs, nameMap)
|
||||
val valueExtractor = AttrValueExtractor(attrMap)
|
||||
def updatePredMap(sourcePred: Expression): Unit = {
|
||||
val indexPreds = sketchesWithIndex.flatMap {
|
||||
case (sketch, idx) =>
|
||||
sketch.convertPredicate(
|
||||
sourcePred,
|
||||
resolvedExprs(sketch),
|
||||
aggrNames(idx).map(UnresolvedAttribute.quoted),
|
||||
nameMap,
|
||||
valueExtractor)
|
||||
}
|
||||
if (indexPreds.nonEmpty) {
|
||||
predMap.getOrElseUpdate(sourcePred, mutable.Buffer.empty) ++= indexPreds
|
||||
}
|
||||
}
|
||||
def forEachTerm(p: Expression, f: Expression => Unit): Unit = {
|
||||
f(p)
|
||||
p match {
|
||||
case And(_, _) | Or(_, _) => p.children.foreach(forEachTerm(_, f))
|
||||
case _ =>
|
||||
}
|
||||
}
|
||||
forEachTerm(predicate, updatePredMap)
|
||||
predMap
|
||||
}
|
||||
|
||||
private def buildAttrMap(
|
||||
predicate: Expression,
|
||||
resolvedExprs: Map[Sketch, Seq[Expression]],
|
||||
nameMap: Map[ExprId, String]): Map[Attribute, Expression] = {
|
||||
val partitionSketchIdx = sketches.indexWhere(_.isInstanceOf[PartitionSketch])
|
||||
if (partitionSketchIdx != -1) {
|
||||
val partitionSketch = sketches(partitionSketchIdx)
|
||||
val sketchValues = aggrNames(partitionSketchIdx).map(UnresolvedAttribute.quoted)
|
||||
val exprExtractors = resolvedExprs(partitionSketch).map(NormalizedExprExtractor(_, nameMap))
|
||||
val exprsAndValues = exprExtractors.zip(sketchValues)
|
||||
predicate.references
|
||||
.flatMap(a => exprsAndValues.find(_._1.unapply(a).isDefined).map(a -> _._2))
|
||||
.toMap
|
||||
} else {
|
||||
Map.empty
|
||||
}
|
||||
}
|
||||
|
||||
private def aggrNames(i: Int): Seq[String] = {
|
||||
aggregateFunctions
|
||||
.slice(sketchOffsets(i), sketchOffsets(i + 1))
|
||||
.map(_.expr.asInstanceOf[NamedExpression].name)
|
||||
}
|
||||
|
||||
/**
|
||||
* Sketch offsets are used to map each sketch to its corresponding columns
|
||||
* in the dataframe.
|
||||
*/
|
||||
@transient
|
||||
private lazy val sketchOffsets: Seq[Int] =
|
||||
sketches.map(_.aggregateFunctions.length).scanLeft(0)(_ + _)
|
||||
|
||||
@transient
|
||||
private lazy val aggregateFunctions = DataSkippingIndex.getNamedAggregateFunctions(sketches)
|
||||
}
|
||||
|
||||
object DataSkippingIndex {
|
||||
// $COVERAGE-OFF$ https://github.com/scoverage/scalac-scoverage-plugin/issues/125
|
||||
final val kind = "DataSkippingIndex"
|
||||
final val kindAbbr = "DS"
|
||||
// $COVERAGE-ON$
|
||||
|
||||
/**
|
||||
* Creates index data for the given source data.
|
||||
*/
|
||||
def index(ctx: IndexerContext, sourceData: DataFrame): DataFrame = {
|
||||
def createIndexData(
|
||||
ctx: IndexerContext,
|
||||
sketches: Seq[Sketch],
|
||||
sourceData: DataFrame): DataFrame = {
|
||||
val fileNameCol = "input_file_name"
|
||||
val aggregateFunctions = getNamedAggregateFunctions(sketches)
|
||||
val indexDataWithFileName = sourceData
|
||||
.groupBy(input_file_name().as(fileNameCol))
|
||||
.agg(aggregateFunctions.head, aggregateFunctions.tail: _*)
|
||||
|
@ -124,44 +302,21 @@ case class DataSkippingIndex(
|
|||
val relation = RelationUtils.getRelation(spark, sourceData.queryExecution.optimizedPlan)
|
||||
import spark.implicits._
|
||||
val fileIdDf = ctx.fileIdTracker
|
||||
.getIdToFileMapping(relation.pathNormalizer)
|
||||
.getIdToFileMapping()
|
||||
.toDF(IndexConstants.DATA_FILE_NAME_ID, fileNameCol)
|
||||
|
||||
indexDataWithFileName
|
||||
.join(
|
||||
fileIdDf.hint("broadcast"),
|
||||
IndexUtils.decodeInputFileName(indexDataWithFileName(fileNameCol)) ===
|
||||
fileIdDf(fileNameCol))
|
||||
IndexUtils.getPath(IndexUtils.decodeInputFileName(indexDataWithFileName(fileNameCol))) ===
|
||||
IndexUtils.getPath(fileIdDf(fileNameCol)))
|
||||
.select(
|
||||
IndexConstants.DATA_FILE_NAME_ID,
|
||||
indexDataWithFileName.columns.filterNot(_ == fileNameCol).map(c => s"`$c`"): _*)
|
||||
}
|
||||
|
||||
private def writeImpl(ctx: IndexerContext, indexData: DataFrame, writeMode: SaveMode): Unit = {
|
||||
indexData.cache()
|
||||
indexData.count() // force cache
|
||||
val indexDataSize = DataFrameUtils.getSizeInBytes(indexData)
|
||||
val targetIndexDataFileSize = HyperspaceConf.DataSkipping.targetIndexDataFileSize(ctx.spark)
|
||||
val numFiles = indexDataSize / targetIndexDataFileSize
|
||||
if (!numFiles.isValidInt) {
|
||||
throw HyperspaceException(
|
||||
"Could not create index data files due to too many files: " +
|
||||
s"indexDataSize=$indexDataSize, targetIndexDataFileSize=$targetIndexDataFileSize")
|
||||
}
|
||||
val repartitionedIndexData = indexData.repartition(math.max(1, numFiles.toInt))
|
||||
repartitionedIndexData.write.mode(writeMode).parquet(ctx.indexDataPath.toString)
|
||||
indexData.unpersist()
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a normalized column name valid for a Parquet format.
|
||||
*/
|
||||
private def getNormalizeColumnName(name: String): String = {
|
||||
name.replaceAll("[ ,;{}()\n\t=]", "_")
|
||||
}
|
||||
|
||||
@transient
|
||||
private lazy val aggregateFunctions = sketches.flatMap { s =>
|
||||
def getNamedAggregateFunctions(sketches: Seq[Sketch]): Seq[Column] = {
|
||||
sketches.flatMap { s =>
|
||||
val aggrs = s.aggregateFunctions
|
||||
assert(aggrs.nonEmpty)
|
||||
aggrs.zipWithIndex.map {
|
||||
|
@ -171,9 +326,10 @@ case class DataSkippingIndex(
|
|||
}
|
||||
}
|
||||
|
||||
object DataSkippingIndex {
|
||||
// $COVERAGE-OFF$ https://github.com/scoverage/scalac-scoverage-plugin/issues/125
|
||||
final val kind = "DataSkippingIndex"
|
||||
final val kindAbbr = "DS"
|
||||
// $COVERAGE-ON$
|
||||
/**
|
||||
* Returns a normalized column name valid for a Parquet format.
|
||||
*/
|
||||
private def getNormalizeColumnName(name: String): String = {
|
||||
name.replaceAll("[ ,;{}()\n\t=]", "_")
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,14 +16,13 @@
|
|||
|
||||
package com.microsoft.hyperspace.index.dataskipping
|
||||
|
||||
import scala.collection.mutable
|
||||
import org.apache.spark.sql.{DataFrame, SparkSession}
|
||||
|
||||
import org.apache.spark.sql.DataFrame
|
||||
|
||||
import com.microsoft.hyperspace.HyperspaceException
|
||||
import com.microsoft.hyperspace.{Hyperspace, HyperspaceException}
|
||||
import com.microsoft.hyperspace.index.{IndexConfigTrait, IndexerContext}
|
||||
import com.microsoft.hyperspace.index.dataskipping.sketch.Sketch
|
||||
import com.microsoft.hyperspace.index.dataskipping.util.ExpressionUtils
|
||||
import com.microsoft.hyperspace.index.dataskipping.expressions.ExpressionUtils
|
||||
import com.microsoft.hyperspace.index.dataskipping.sketches.{PartitionSketch, Sketch}
|
||||
import com.microsoft.hyperspace.util.HyperspaceConf
|
||||
|
||||
/**
|
||||
* DataSkippingIndexConfig is used to create a [[DataSkippingIndex]] via
|
||||
|
@ -59,9 +58,29 @@ case class DataSkippingIndexConfig(
|
|||
sourceData: DataFrame,
|
||||
properties: Map[String, String]): (DataSkippingIndex, DataFrame) = {
|
||||
val resolvedSketches = ExpressionUtils.resolve(ctx.spark, sketches, sourceData)
|
||||
checkDuplicateSketches(resolvedSketches)
|
||||
val index = DataSkippingIndex(resolvedSketches, properties)
|
||||
(index, index.index(ctx, sourceData))
|
||||
val autoPartitionSketch = HyperspaceConf.DataSkipping.autoPartitionSketch(ctx.spark)
|
||||
val partitionSketchOpt =
|
||||
if (autoPartitionSketch) getPartitionSketch(ctx.spark, sourceData)
|
||||
else None
|
||||
val finalSketches = partitionSketchOpt.toSeq ++ resolvedSketches
|
||||
checkDuplicateSketches(finalSketches)
|
||||
val indexData = DataSkippingIndex.createIndexData(ctx, finalSketches, sourceData)
|
||||
val index = DataSkippingIndex(finalSketches, indexData.schema, properties)
|
||||
(index, indexData)
|
||||
}
|
||||
|
||||
private def getPartitionSketch(
|
||||
spark: SparkSession,
|
||||
sourceData: DataFrame): Option[PartitionSketch] = {
|
||||
val relation = Hyperspace
|
||||
.getContext(spark)
|
||||
.sourceProviderManager
|
||||
.getRelation(sourceData.queryExecution.optimizedPlan)
|
||||
if (relation.partitionSchema.nonEmpty) {
|
||||
Some(PartitionSketch(relation.partitionSchema.map(f => (f.name, Some(f.dataType)))))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
private def checkDuplicateSketches(sketches: Seq[Sketch]): Unit = {
|
||||
|
|
|
@ -0,0 +1,74 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.execution
|
||||
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.spark.sql.{Column, DataFrame, SparkSession}
|
||||
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||
import org.apache.spark.sql.execution.datasources._
|
||||
import org.apache.spark.sql.functions.isnull
|
||||
import org.apache.spark.sql.types.StructType
|
||||
|
||||
import com.microsoft.hyperspace.index.{FileIdTracker, IndexConstants}
|
||||
|
||||
/**
|
||||
* FileIndex implementation that filters out files using the index data and
|
||||
* index predicates generated by applying a data skipping index to the source.
|
||||
*/
|
||||
class DataSkippingFileIndex(
|
||||
sparkSession: SparkSession,
|
||||
fileIdTracker: FileIdTracker,
|
||||
indexData: DataFrame,
|
||||
private[dataskipping] val indexDataPred: Expression, // exposed for test
|
||||
private[dataskipping] val baseFileIndex: FileIndex)
|
||||
extends FileIndex {
|
||||
|
||||
override def listFiles(
|
||||
partitionFilters: Seq[Expression],
|
||||
dataFilters: Seq[Expression]): Seq[PartitionDirectory] = {
|
||||
val partitions = baseFileIndex.listFiles(partitionFilters, dataFilters)
|
||||
import sparkSession.implicits._
|
||||
val pathCol = "__path"
|
||||
val filesWithId = partitions
|
||||
.flatMap(_.files.map(f => (f.getPath.toString, fileIdTracker.addFile(f))))
|
||||
.toDF(pathCol, IndexConstants.DATA_FILE_NAME_ID)
|
||||
val selectedFiles = filesWithId
|
||||
.hint("broadcast")
|
||||
.join(indexData, Seq(IndexConstants.DATA_FILE_NAME_ID), "left")
|
||||
.filter(isnull(indexData(IndexConstants.DATA_FILE_NAME_ID)) || new Column(indexDataPred))
|
||||
.select(pathCol)
|
||||
.collect
|
||||
.map(_.getString(0))
|
||||
.toSet
|
||||
val selectedPartitions = partitions
|
||||
.map(p => p.copy(files = p.files.filter(f => selectedFiles.contains(f.getPath.toString))))
|
||||
.filter(_.files.nonEmpty)
|
||||
selectedPartitions
|
||||
}
|
||||
|
||||
override def rootPaths: Seq[Path] = baseFileIndex.rootPaths
|
||||
|
||||
override def inputFiles: Array[String] = baseFileIndex.inputFiles
|
||||
|
||||
override def refresh(): Unit = baseFileIndex.refresh()
|
||||
|
||||
override def sizeInBytes: Long = baseFileIndex.sizeInBytes
|
||||
|
||||
override def partitionSchema: StructType = baseFileIndex.partitionSchema
|
||||
|
||||
override def metadataOpsTimeNs: Option[Long] = baseFileIndex.metadataOpsTimeNs
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.expressions
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, SubqueryExpression}
|
||||
|
||||
case class AttrValueExtractor(attrMap: Map[Attribute, Expression]) extends ExpressionExtractor {
|
||||
override def unapply(e: Expression): Option[Expression] = {
|
||||
if (canTransform(e)) Some(transform(e)) else None
|
||||
}
|
||||
|
||||
private def canTransform(e: Expression): Boolean = {
|
||||
e.deterministic &&
|
||||
e.references.forall(attrMap.contains) &&
|
||||
!SubqueryExpression.hasSubquery(e)
|
||||
}
|
||||
|
||||
private def transform(e: Expression): Expression = {
|
||||
e.transform { case a: Attribute => attrMap(a) }
|
||||
}
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.expressions
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||
|
||||
trait ExpressionExtractor {
|
||||
def unapply(e: Expression): Option[Expression]
|
||||
}
|
|
@ -14,26 +14,23 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.util
|
||||
package com.microsoft.hyperspace.index.dataskipping.expressions
|
||||
|
||||
import java.util.UUID
|
||||
|
||||
import org.apache.spark.sql.{DataFrame, SparkSession}
|
||||
import org.apache.spark.sql.catalyst.InternalRow
|
||||
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
|
||||
import org.apache.spark.sql.catalyst.expressions._
|
||||
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
|
||||
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Filter, LogicalPlan, Project, Window}
|
||||
import org.apache.spark.sql.types.DataType
|
||||
import org.apache.spark.sql.types.{BooleanType, DataType}
|
||||
|
||||
import com.microsoft.hyperspace.HyperspaceException
|
||||
import com.microsoft.hyperspace.index.IndexUtils
|
||||
import com.microsoft.hyperspace.index.dataskipping.sketch.Sketch
|
||||
import com.microsoft.hyperspace.index.dataskipping.sketches.Sketch
|
||||
import com.microsoft.hyperspace.index.rules.ApplyHyperspace.withHyperspaceRuleDisabled
|
||||
|
||||
object ExpressionUtils {
|
||||
|
||||
val nullExprId = ExprId(0, new UUID(0, 0))
|
||||
|
||||
/**
|
||||
* Returns copies of the given sketches with the indexed columns replaced by
|
||||
* resolved column names and data types.
|
||||
|
@ -60,9 +57,8 @@ object ExpressionUtils {
|
|||
private def checkExprs(
|
||||
exprWithExpectedDataTypes: Seq[(String, Option[DataType])],
|
||||
sourceData: DataFrame): Seq[DataType] = {
|
||||
val (exprs, expectedDataTypes) =
|
||||
(exprWithExpectedDataTypes.map(_._1), exprWithExpectedDataTypes.map(_._2))
|
||||
def throwNotSupportedIf(cond: Boolean, msg: => String) = {
|
||||
val exprs = exprWithExpectedDataTypes.map(_._1)
|
||||
def throwNotSupportedIf(cond: Boolean, msg: => String): Unit = {
|
||||
if (cond) {
|
||||
throw HyperspaceException(s"DataSkippingIndex does not support indexing $msg")
|
||||
}
|
||||
|
@ -87,7 +83,7 @@ object ExpressionUtils {
|
|||
e.find(_.isInstanceOf[SubqueryExpression]).nonEmpty,
|
||||
s"an expression which has a subquery: $expr")
|
||||
throwNotSupportedIf(
|
||||
e.find(_.isInstanceOf[AttributeReference]).isEmpty,
|
||||
e.find(_.isInstanceOf[Attribute]).isEmpty,
|
||||
s"an expression which does not reference source columns: $expr")
|
||||
if (expectedDataType.nonEmpty && expectedDataType.get != analyzedExpr.dataType) {
|
||||
throw HyperspaceException(
|
||||
|
@ -101,14 +97,11 @@ object ExpressionUtils {
|
|||
/**
|
||||
* Used to workaround the issue where UnresolvedAttribute.sql() doesn't work as expected.
|
||||
*/
|
||||
private case class QuotedAttribute(name: String) extends LeafExpression {
|
||||
private case class QuotedAttribute(name: String) extends LeafExpression with Unevaluable {
|
||||
override def sql: String = name
|
||||
|
||||
// $COVERAGE-OFF$ code never used
|
||||
override def nullable: Boolean = throw new NotImplementedError
|
||||
override def eval(input: InternalRow): Any = throw new NotImplementedError
|
||||
override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode =
|
||||
throw new NotImplementedError
|
||||
override def dataType: DataType = throw new NotImplementedError
|
||||
// $COVERAGE-ON$
|
||||
}
|
||||
|
@ -121,12 +114,33 @@ object ExpressionUtils {
|
|||
*/
|
||||
def normalize(expr: Expression): Expression = {
|
||||
expr.transformUp {
|
||||
case a: AttributeReference => a.withExprId(nullExprId).withQualifier(Nil)
|
||||
case a: Attribute => a.withExprId(nullExprId).withQualifier(Nil)
|
||||
case g @ GetStructField(child, ordinal, _) => g.copy(child, ordinal, None)
|
||||
// Undo HandleNullInputsForUDF and ReplaceNullWithFalseInPredicate so
|
||||
// that we can match scala UDF invocations. HandleNullInputsForUDF rule
|
||||
// transforms ScalaUDF(args...) into If(Or(IsNull(args)...), null,
|
||||
// ScalaUDF(KnownNotNull(args)...)), and ReplaceNullWithFalseInPredicate
|
||||
// rule transforms null into false. ReplaceNullWithFalseInPredicate is
|
||||
// sensitive to the tree shape.
|
||||
// This is a safe operation because we ignore null values when computing
|
||||
// sketch values. For example, MinMaxSketch("f(a)") will only collect
|
||||
// min/max values of non-null results of "f(a)". Then we can use those
|
||||
// sketch values to translate filter conditions like "f(a) = 1". Here,
|
||||
// we are only interested in whether those "f(a)" refers to the same
|
||||
// thing, not how they should be evaluated. Normalized expressions are
|
||||
// only meant to be compared, not evaluated.
|
||||
case If(
|
||||
ExtractIsNullDisjunction(args1),
|
||||
Literal(null | false, dataType1),
|
||||
udf @ ExtractScalaUDF(dataType2, ExtractKnownNotNullArgs(args2)))
|
||||
if args1 == args2 && dataType1 == dataType2 =>
|
||||
udf.copy(children = args2)
|
||||
}
|
||||
}
|
||||
|
||||
// Needed because ScalaUDF has a different number of arguments depending on Spark versions.
|
||||
// Exposed for test
|
||||
private[dataskipping] val nullExprId: ExprId = ExprId(0, new UUID(0, 0))
|
||||
|
||||
private[dataskipping] object ExtractScalaUDF {
|
||||
def unapply(e: ScalaUDF): Option[(DataType, Seq[Expression])] = {
|
||||
Some((e.dataType, e.children))
|
||||
|
@ -151,4 +165,53 @@ object ExpressionUtils {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns sketch expressions that can be used to match indexed expressions
|
||||
* and expressions in the filter condition. For example, when a user creates
|
||||
* an index with MinMaxSketch("A"), we create an expression corresponding to
|
||||
* "A" here, and later we try to match expression nodes in a filter condition,
|
||||
* say, EqualTo(AttributeReference("A"), Literal(1)), to the expression for
|
||||
* "A".
|
||||
*
|
||||
* We need this step as the filter/join conditions are given to us as a tree
|
||||
* of expressions in the Spark's optimizer, whereas the indexed expressions
|
||||
* are provided and stored as strings.
|
||||
*/
|
||||
def getResolvedExprs(
|
||||
spark: SparkSession,
|
||||
sketches: Seq[Sketch],
|
||||
source: LogicalPlan): Option[Map[Sketch, Seq[Expression]]] = {
|
||||
val resolvedExprs = sketches.map { s =>
|
||||
s -> s.expressions.map {
|
||||
case (expr, dataTypeOpt) =>
|
||||
val parsedExpr = spark.sessionState.sqlParser.parseExpression(expr)
|
||||
val dataType = dataTypeOpt.get
|
||||
val filter = dataType match {
|
||||
case BooleanType => Filter(parsedExpr, source)
|
||||
case _ => Filter(PredicateWrapper(parsedExpr), source)
|
||||
}
|
||||
val optimizedFilter = withHyperspaceRuleDisabled {
|
||||
spark.sessionState.optimizer
|
||||
.execute(spark.sessionState.analyzer.execute(filter))
|
||||
.asInstanceOf[Filter]
|
||||
}
|
||||
val resolvedExpr = dataType match {
|
||||
case BooleanType => optimizedFilter.condition
|
||||
case _ => optimizedFilter.condition.asInstanceOf[PredicateWrapper].child
|
||||
}
|
||||
if (resolvedExpr.dataType != dataType) {
|
||||
return None
|
||||
}
|
||||
normalize(resolvedExpr)
|
||||
}
|
||||
}.toMap
|
||||
Some(resolvedExprs)
|
||||
}
|
||||
|
||||
// Used to preserve sketch expressions during optimization
|
||||
private case class PredicateWrapper(override val child: Expression)
|
||||
extends UnaryExpression
|
||||
with Unevaluable
|
||||
with Predicate
|
||||
}
|
|
@ -14,17 +14,15 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.util
|
||||
package com.microsoft.hyperspace.index.dataskipping.expressions
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, ExprId}
|
||||
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, ExprId}
|
||||
|
||||
case class NormalizedExprMatcher(expr: Expression, nameMap: Map[ExprId, String])
|
||||
extends ExprMatcher {
|
||||
def apply(e: Expression): Boolean = {
|
||||
val renamed = e.transformUp {
|
||||
case a: AttributeReference => a.withName(nameMap(a.exprId))
|
||||
}
|
||||
case class NormalizedExprExtractor(expr: Expression, nameMap: Map[ExprId, String])
|
||||
extends ExpressionExtractor {
|
||||
def unapply(e: Expression): Option[Expression] = {
|
||||
val renamed = e.transformUp { case a: Attribute => a.withName(nameMap(a.exprId)) }
|
||||
val normalized = ExpressionUtils.normalize(renamed)
|
||||
expr == normalized
|
||||
if (expr == normalized) Some(expr) else None
|
||||
}
|
||||
}
|
|
@ -14,11 +14,11 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.util
|
||||
package com.microsoft.hyperspace.index.dataskipping.expressions
|
||||
|
||||
import org.apache.spark.sql.catalyst.InternalRow
|
||||
import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, Expression}
|
||||
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral}
|
||||
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode}
|
||||
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
|
||||
import org.apache.spark.sql.catalyst.util.{ArrayData, TypeUtils}
|
||||
import org.apache.spark.sql.types.IntegerType
|
||||
|
@ -28,12 +28,13 @@ import org.apache.spark.sql.types.IntegerType
|
|||
* than (greater than or equal to) the value (right), or null if there is no such
|
||||
* element.
|
||||
*
|
||||
* If the value (right) is null, null is returned.
|
||||
*
|
||||
* Preconditions (unchecked):
|
||||
* - The array must not be null.
|
||||
* - Elements in the array must be in ascending order.
|
||||
* - The array must not contain null elements.
|
||||
* - The array must not contain duplicate elements.
|
||||
* - The value must not be null.
|
||||
*/
|
||||
private[dataskipping] case class SortedArrayLowerBound(left: Expression, right: Expression)
|
||||
extends BinaryExpression {
|
||||
|
@ -47,6 +48,7 @@ private[dataskipping] case class SortedArrayLowerBound(left: Expression, right:
|
|||
override def eval(input: InternalRow): Any = {
|
||||
val arr = left.eval(input).asInstanceOf[ArrayData]
|
||||
val value = right.eval(input)
|
||||
if (value != null) {
|
||||
val dt = right.dataType
|
||||
val n = arr.numElements()
|
||||
if (n > 0) {
|
||||
|
@ -58,6 +60,7 @@ private[dataskipping] case class SortedArrayLowerBound(left: Expression, right:
|
|||
return index + 1
|
||||
}
|
||||
}
|
||||
}
|
||||
null
|
||||
}
|
||||
|
||||
|
@ -73,6 +76,7 @@ private[dataskipping] case class SortedArrayLowerBound(left: Expression, right:
|
|||
val binarySearch = SortedArrayUtils.binarySearchCodeGen(ctx, dt)
|
||||
val resultCode =
|
||||
s"""
|
||||
|if (!(${rightGen.isNull})) {
|
||||
| int $n = $arr.numElements();
|
||||
| if ($n > 0) {
|
||||
| if (!(${ctx.genGreater(dt, value, firstValueInArr)})) {
|
||||
|
@ -83,6 +87,7 @@ private[dataskipping] case class SortedArrayLowerBound(left: Expression, right:
|
|||
| ${ev.value} = $binarySearch($arr, 0, $n, $value).index() + 1;
|
||||
| }
|
||||
| }
|
||||
|}
|
||||
""".stripMargin
|
||||
ev.copy(code = code"""
|
||||
${leftGen.code}
|
|
@ -14,7 +14,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.util
|
||||
package com.microsoft.hyperspace.index.dataskipping.expressions
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator}
|
||||
import org.apache.spark.sql.catalyst.util.ArrayData
|
|
@ -0,0 +1,123 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.expressions
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions._
|
||||
import org.apache.spark.sql.types.BooleanType
|
||||
|
||||
case class EqualToExtractor(left: ExpressionExtractor, right: ExpressionExtractor) {
|
||||
def unapply(p: Expression): Option[(Expression, Expression)] =
|
||||
p match {
|
||||
case EqualTo(left(l), right(r)) => Some((l, r))
|
||||
case EqualTo(right(r), left(l)) => Some((l, r))
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
||||
case class EqualNullSafeExtractor(left: ExpressionExtractor, right: ExpressionExtractor) {
|
||||
def unapply(p: Expression): Option[(Expression, Expression)] =
|
||||
p match {
|
||||
case EqualNullSafe(left(l), right(r)) => Some((l, r))
|
||||
case EqualNullSafe(right(r), left(l)) => Some((l, r))
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
||||
case class LessThanExtractor(left: ExpressionExtractor, right: ExpressionExtractor) {
|
||||
def unapply(p: Expression): Option[(Expression, Expression)] =
|
||||
p match {
|
||||
case LessThan(left(l), right(r)) => Some((l, r))
|
||||
case GreaterThan(right(r), left(l)) => Some((l, r))
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
||||
case class LessThanOrEqualExtractor(left: ExpressionExtractor, right: ExpressionExtractor) {
|
||||
def unapply(p: Expression): Option[(Expression, Expression)] =
|
||||
p match {
|
||||
case LessThanOrEqual(left(l), right(r)) => Some((l, r))
|
||||
case GreaterThanOrEqual(right(r), left(l)) => Some((l, r))
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
||||
case class IsNullExtractor(expr: ExpressionExtractor) {
|
||||
def unapply(p: Expression): Option[Expression] =
|
||||
p match {
|
||||
case IsNull(expr(e)) => Some(e)
|
||||
case EqualNullSafe(expr(e), v: Literal) if v.value == null => Some(e)
|
||||
case EqualNullSafe(v: Literal, expr(e)) if v.value == null => Some(e)
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
||||
case class IsNotNullExtractor(expr: ExpressionExtractor) {
|
||||
def unapply(p: Expression): Option[Expression] =
|
||||
p match {
|
||||
case IsNotNull(expr(e)) => Some(e)
|
||||
// Spark 2.4 lacks a rule transforming Not(IsNull) to IsNotNull
|
||||
case Not(IsNull(expr(e))) => Some(e)
|
||||
case Not(EqualNullSafe(expr(e), v: Literal)) if v.value == null => Some(e)
|
||||
case Not(EqualNullSafe(v: Literal, expr(e))) if v.value == null => Some(e)
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
||||
case class IsTrueExtractor(expr: ExpressionExtractor) {
|
||||
def unapply(p: Expression): Option[Expression] =
|
||||
p.dataType match {
|
||||
case BooleanType =>
|
||||
p match {
|
||||
case expr(e) => Some(e)
|
||||
case EqualTo(expr(e), Literal(true, BooleanType)) => Some(e)
|
||||
case EqualTo(Literal(true, BooleanType), expr(e)) => Some(e)
|
||||
case _ => None
|
||||
}
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
||||
case class IsFalseExtractor(expr: ExpressionExtractor) {
|
||||
def unapply(p: Expression): Option[Expression] =
|
||||
p.dataType match {
|
||||
case BooleanType =>
|
||||
p match {
|
||||
case Not(expr(e)) => Some(e)
|
||||
case EqualTo(expr(e), Literal(false, BooleanType)) => Some(e)
|
||||
case EqualTo(Literal(false, BooleanType), expr(e)) => Some(e)
|
||||
case _ => None
|
||||
}
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
||||
case class InExtractor(arg: ExpressionExtractor, element: ExpressionExtractor) {
|
||||
def unapply(p: Expression): Option[(Expression, Seq[Expression])] =
|
||||
p match {
|
||||
case In(arg(a), vs) =>
|
||||
Some((a, vs.map(element.unapply(_).getOrElse { return None })))
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
||||
case class InSetExtractor(arg: ExpressionExtractor) {
|
||||
def unapply(p: Expression): Option[(Expression, Set[Any])] =
|
||||
p match {
|
||||
case InSet(arg(a), vs) => Some((a, vs))
|
||||
case _ => None
|
||||
}
|
||||
}
|
|
@ -0,0 +1,105 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.rules
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions.AttributeReference
|
||||
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}
|
||||
import org.apache.spark.sql.execution.datasources._
|
||||
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
|
||||
import org.apache.spark.sql.hyperspace.utils.logicalPlanToDataFrame
|
||||
import org.apache.spark.sql.types.StructType
|
||||
|
||||
import com.microsoft.hyperspace.index.{IndexLogEntry, IndexLogEntryTags}
|
||||
import com.microsoft.hyperspace.index.dataskipping.DataSkippingIndex
|
||||
import com.microsoft.hyperspace.index.dataskipping.execution.DataSkippingFileIndex
|
||||
import com.microsoft.hyperspace.index.plans.logical.IndexHadoopFsRelation
|
||||
import com.microsoft.hyperspace.index.rules._
|
||||
import com.microsoft.hyperspace.index.rules.ApplyHyperspace.PlanToSelectedIndexMap
|
||||
|
||||
object ApplyDataSkippingIndex extends HyperspaceRule {
|
||||
protected override val filtersOnQueryPlan: Seq[QueryPlanIndexFilter] =
|
||||
IndexTypeFilter[DataSkippingIndex]() :: FilterPlanNodeFilter :: FilterConditionFilter :: Nil
|
||||
|
||||
protected override val indexRanker: IndexRankFilter = DataSkippingIndexRanker
|
||||
|
||||
override def applyIndex(plan: LogicalPlan, indexes: PlanToSelectedIndexMap): LogicalPlan = {
|
||||
if (indexes.isEmpty) {
|
||||
return plan
|
||||
}
|
||||
plan match {
|
||||
case filter @ Filter(_, ExtractRelation(relation)) =>
|
||||
val indexLogEntry = indexes(relation.plan)
|
||||
val indexDataPred = indexLogEntry
|
||||
.getTagValue(plan, IndexLogEntryTags.DATASKIPPING_INDEX_PREDICATE)
|
||||
.get
|
||||
.getOrElse { return plan }
|
||||
val indexData = logicalPlanToDataFrame(spark, getIndexDataRelation(indexLogEntry))
|
||||
val originalFileIndex = indexLogEntry.withCachedTag(
|
||||
relation.plan,
|
||||
IndexLogEntryTags.DATASKIPPING_SOURCE_FILEINDEX) {
|
||||
relation.getOrCreateFileIndex(spark)
|
||||
}
|
||||
val dataSkippingFileIndex = new DataSkippingFileIndex(
|
||||
spark,
|
||||
indexLogEntry.fileIdTracker,
|
||||
indexData,
|
||||
indexDataPred,
|
||||
originalFileIndex)
|
||||
val newRelation = relation.createLogicalRelation(
|
||||
IndexHadoopFsRelation(
|
||||
relation.createHadoopFsRelation(
|
||||
dataSkippingFileIndex,
|
||||
relation.schema,
|
||||
relation.options),
|
||||
spark,
|
||||
indexLogEntry),
|
||||
relation.output.map(_.asInstanceOf[AttributeReference]))
|
||||
filter.copy(child = newRelation)
|
||||
case _ => plan
|
||||
}
|
||||
}
|
||||
|
||||
override def score(plan: LogicalPlan, indexes: PlanToSelectedIndexMap): Int = {
|
||||
if (indexes.isEmpty) {
|
||||
return 0
|
||||
}
|
||||
// Return the lowest score so that covering indexes take precedence over
|
||||
// data skipping indexes.
|
||||
1
|
||||
}
|
||||
|
||||
private def getIndexDataRelation(indexLogEntry: IndexLogEntry): LogicalRelation = {
|
||||
val indexDataSchema = indexLogEntry.derivedDataset.asInstanceOf[DataSkippingIndex].schema
|
||||
val indexDataLoc =
|
||||
indexLogEntry.withCachedTag(IndexLogEntryTags.DATASKIPPING_INDEX_FILEINDEX) {
|
||||
new InMemoryFileIndex(
|
||||
spark,
|
||||
indexLogEntry.content.files,
|
||||
Map.empty,
|
||||
Some(indexDataSchema),
|
||||
FileStatusCache.getOrCreate(spark))
|
||||
}
|
||||
LogicalRelation(
|
||||
HadoopFsRelation(
|
||||
indexDataLoc,
|
||||
StructType(Nil),
|
||||
indexDataSchema,
|
||||
None,
|
||||
new ParquetFileFormat,
|
||||
Map.empty)(spark))
|
||||
}
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.rules
|
||||
|
||||
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
|
||||
|
||||
import com.microsoft.hyperspace.index.dataskipping.DataSkippingIndex
|
||||
import com.microsoft.hyperspace.index.rules.ApplyHyperspace.{PlanToIndexesMap, PlanToSelectedIndexMap}
|
||||
import com.microsoft.hyperspace.index.rules.IndexRankFilter
|
||||
|
||||
object DataSkippingIndexRanker extends IndexRankFilter {
|
||||
override def apply(
|
||||
plan: LogicalPlan,
|
||||
applicableIndexes: PlanToIndexesMap): PlanToSelectedIndexMap = {
|
||||
// TODO: Multiple data skipping index can be applied to the same plan node,
|
||||
// although the effectiveness decreases as more indexes are applied.
|
||||
// The framework should be updated to allow multiple indexes.
|
||||
// For now, simply choose the index with most sketches.
|
||||
applicableIndexes.collect {
|
||||
case (plan, indexes) if indexes.nonEmpty =>
|
||||
plan -> indexes.maxBy(_.derivedDataset.asInstanceOf[DataSkippingIndex].sketches.length)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.rules
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}
|
||||
|
||||
import com.microsoft.hyperspace.index.IndexLogEntryTags
|
||||
import com.microsoft.hyperspace.index.dataskipping.DataSkippingIndex
|
||||
import com.microsoft.hyperspace.index.plananalysis.FilterReasons
|
||||
import com.microsoft.hyperspace.index.rules.{ExtractRelation, QueryPlanIndexFilter}
|
||||
import com.microsoft.hyperspace.index.rules.ApplyHyperspace.PlanToIndexesMap
|
||||
|
||||
/**
|
||||
* FilterConditionFilter filters indexes out if
|
||||
* 1) an index cannot be applied to the filter condition.
|
||||
*/
|
||||
object FilterConditionFilter extends QueryPlanIndexFilter {
|
||||
override def apply(plan: LogicalPlan, candidateIndexes: PlanToIndexesMap): PlanToIndexesMap = {
|
||||
if (candidateIndexes.isEmpty) {
|
||||
return Map.empty
|
||||
}
|
||||
plan match {
|
||||
case Filter(condition: Expression, ExtractRelation(relation)) =>
|
||||
val applicableIndexes = candidateIndexes(relation.plan).flatMap { indexLogEntry =>
|
||||
val indexDataPredOpt =
|
||||
indexLogEntry.withCachedTag(plan, IndexLogEntryTags.DATASKIPPING_INDEX_PREDICATE) {
|
||||
val index = indexLogEntry.derivedDataset.asInstanceOf[DataSkippingIndex]
|
||||
index.translateFilterCondition(spark, condition, relation.plan)
|
||||
}
|
||||
if (withFilterReasonTag(
|
||||
plan,
|
||||
indexLogEntry,
|
||||
FilterReasons.IneligibleFilterCondition(condition.sql))(
|
||||
indexDataPredOpt.nonEmpty)) {
|
||||
Some(indexLogEntry)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
if (applicableIndexes.nonEmpty) {
|
||||
Map(relation.plan -> applicableIndexes)
|
||||
} else {
|
||||
Map.empty
|
||||
}
|
||||
case _ => Map.empty
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.rules
|
||||
|
||||
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}
|
||||
|
||||
import com.microsoft.hyperspace.index.rules.{ExtractRelation, QueryPlanIndexFilter, RuleUtils}
|
||||
import com.microsoft.hyperspace.index.rules.ApplyHyperspace.PlanToIndexesMap
|
||||
|
||||
/**
|
||||
* FilterPlanNodeFilter filters indexes out if
|
||||
* 1) the given plan is not eligible filter plan node.
|
||||
* 2) the source plan of index is not part of the filter plan.
|
||||
*/
|
||||
object FilterPlanNodeFilter extends QueryPlanIndexFilter {
|
||||
override def apply(plan: LogicalPlan, candidateIndexes: PlanToIndexesMap): PlanToIndexesMap = {
|
||||
if (candidateIndexes.isEmpty) {
|
||||
return Map.empty
|
||||
}
|
||||
plan match {
|
||||
case Filter(_, ExtractRelation(relation)) if !RuleUtils.isIndexApplied(relation) =>
|
||||
candidateIndexes.filterKeys(relation.plan.equals)
|
||||
case _ => Map.empty
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,43 +0,0 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.sketch
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||
import org.apache.spark.sql.catalyst.expressions.aggregate.{Max, Min}
|
||||
import org.apache.spark.sql.types.DataType
|
||||
|
||||
/**
|
||||
* Sketch based on minimum and maximum values for a given expression.
|
||||
*
|
||||
* @param expr Expression from which min/max values are calculated
|
||||
* @param dataType Optional data type to specify the expected data type of the
|
||||
* expression. If not specified, it is deduced automatically.
|
||||
* If the actual data type of the expression is different from this,
|
||||
* an error is thrown. Users are recommended to leave this parameter to
|
||||
* None.
|
||||
*/
|
||||
case class MinMaxSketch(override val expr: String, override val dataType: Option[DataType] = None)
|
||||
extends SingleExprSketch[MinMaxSketch](expr, dataType) {
|
||||
override def name: String = "MinMax"
|
||||
|
||||
override def withNewExpression(newExpr: (String, Option[DataType])): MinMaxSketch = {
|
||||
copy(expr = newExpr._1, dataType = newExpr._2)
|
||||
}
|
||||
|
||||
override def aggregateFunctions: Seq[Expression] =
|
||||
Min(parsedExpr).toAggregateExpression() :: Max(parsedExpr).toAggregateExpression() :: Nil
|
||||
}
|
|
@@ -0,0 +1,100 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.sketches
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions._
|
||||
import org.apache.spark.sql.catalyst.expressions.aggregate.{Max, Min}
|
||||
import org.apache.spark.sql.catalyst.util.{ArrayData, TypeUtils}
|
||||
import org.apache.spark.sql.types.{ArrayType, DataType}
|
||||
|
||||
import com.microsoft.hyperspace.index.dataskipping.expressions._
|
||||
import com.microsoft.hyperspace.index.dataskipping.util.ArrayUtils
|
||||
|
||||
/**
|
||||
* Sketch based on minimum and maximum values for a given expression.
|
||||
*
|
||||
* @param expr Expression from which min/max values are calculated
|
||||
* @param dataType Optional data type to specify the expected data type of the
|
||||
* expression. If not specified, it is deduced automatically.
|
||||
* If the actual data type of the expression is different from this,
|
||||
* an error is thrown. It is recommended to leave this parameter as
|
||||
* None.
|
||||
*/
|
||||
case class MinMaxSketch(override val expr: String, override val dataType: Option[DataType] = None)
|
||||
extends SingleExprSketch[MinMaxSketch](expr, dataType) {
|
||||
override def name: String = "MinMax"
|
||||
|
||||
override def withNewExpression(newExpr: (String, Option[DataType])): MinMaxSketch = {
|
||||
copy(expr = newExpr._1, dataType = newExpr._2)
|
||||
}
|
||||
|
||||
override def aggregateFunctions: Seq[Expression] =
|
||||
Min(parsedExpr).toAggregateExpression() :: Max(parsedExpr).toAggregateExpression() :: Nil
|
||||
|
||||
override def convertPredicate(
|
||||
predicate: Expression,
|
||||
resolvedExprs: Seq[Expression],
|
||||
sketchValues: Seq[Expression],
|
||||
nameMap: Map[ExprId, String],
|
||||
valueExtractor: ExpressionExtractor): Option[Expression] = {
|
||||
val min = sketchValues(0)
|
||||
val max = sketchValues(1)
|
||||
// TODO: Add third sketch value "hasNull" of type bool
|
||||
// true if the expr can be null in the file, false if otherwise
|
||||
// to optimize IsNull (can skip files with hasNull = false)
|
||||
// This can be also done as a separate sketch, e.g. HasNullSketch
|
||||
// Should evaluate which way is better
|
||||
val resolvedExpr = resolvedExprs.head
|
||||
val dataType = resolvedExpr.dataType
|
||||
val exprExtractor = NormalizedExprExtractor(resolvedExpr, nameMap)
|
||||
val ExprIsTrue = IsTrueExtractor(exprExtractor)
|
||||
val ExprIsFalse = IsFalseExtractor(exprExtractor)
|
||||
val ExprIsNotNull = IsNotNullExtractor(exprExtractor)
|
||||
val ExprEqualTo = EqualToExtractor(exprExtractor, valueExtractor)
|
||||
val ExprEqualNullSafe = EqualNullSafeExtractor(exprExtractor, valueExtractor)
|
||||
val ExprLessThan = LessThanExtractor(exprExtractor, valueExtractor)
|
||||
val ExprLessThanOrEqualTo = LessThanOrEqualExtractor(exprExtractor, valueExtractor)
|
||||
val ExprGreaterThan = LessThanExtractor(valueExtractor, exprExtractor)
|
||||
val ExprGreaterThanOrEqualTo = LessThanOrEqualExtractor(valueExtractor, exprExtractor)
|
||||
val ExprIn = InExtractor(exprExtractor, valueExtractor)
|
||||
val ExprInSet = InSetExtractor(exprExtractor)
|
||||
Option(predicate)
|
||||
.collect {
|
||||
case ExprIsTrue(_) => max
|
||||
case ExprIsFalse(_) => Not(min)
|
||||
case ExprIsNotNull(_) => IsNotNull(min)
|
||||
case ExprEqualTo(_, v) => And(LessThanOrEqual(min, v), GreaterThanOrEqual(max, v))
|
||||
case ExprEqualNullSafe(_, v) =>
|
||||
Or(IsNull(v), And(LessThanOrEqual(min, v), GreaterThanOrEqual(max, v)))
|
||||
case ExprLessThan(_, v) => LessThan(min, v)
|
||||
case ExprLessThanOrEqualTo(_, v) => LessThanOrEqual(min, v)
|
||||
case ExprGreaterThan(v, _) => GreaterThan(max, v)
|
||||
case ExprGreaterThanOrEqualTo(v, _) => GreaterThanOrEqual(max, v)
|
||||
case ExprIn(_, vs) =>
|
||||
vs.map(v => And(LessThanOrEqual(min, v), GreaterThanOrEqual(max, v))).reduceLeft(Or)
|
||||
case ExprInSet(_, vs) =>
|
||||
val sortedValues = Literal(
|
||||
ArrayData.toArrayData(
|
||||
ArrayUtils.toArray(
|
||||
vs.filter(_ != null).toArray.sorted(TypeUtils.getInterpretedOrdering(dataType)),
|
||||
dataType)),
|
||||
ArrayType(dataType, containsNull = false))
|
||||
LessThanOrEqual(ElementAt(sortedValues, SortedArrayLowerBound(sortedValues, min)), max)
|
||||
// TODO: StartsWith, Like with constant prefix
|
||||
}
|
||||
}
|
||||
}
|
|
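
To make the EqualTo branch above concrete: for a source predicate A = 1, the sketch keeps a file only when 1 lies within that file's [min, max] range. A hedged sketch using raw Catalyst expressions, where Min_A/Max_A are illustrative names for whatever index-data columns the framework passes in as sketchValues:

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types.LongType

// Placeholder references to the two sketch value columns in the index data.
val minA = AttributeReference("Min_A", LongType)()
val maxA = AttributeReference("Max_A", LongType)()

// What the ExprEqualTo case produces for the source predicate A = 1:
val keepFile = And(LessThanOrEqual(minA, Literal(1L)), GreaterThanOrEqual(maxA, Literal(1L)))
// A file may be skipped only when this evaluates to false for its index row.
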
@@ -0,0 +1,73 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.sketches
|
||||
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.apache.spark.sql.catalyst.expressions._
|
||||
import org.apache.spark.sql.types.DataType
|
||||
|
||||
import com.microsoft.hyperspace.index.dataskipping.expressions._
|
||||
import com.microsoft.hyperspace.shim.FirstNullSafe
|
||||
|
||||
/**
|
||||
* Internal implementation for partitioning column.
|
||||
*
|
||||
* This sketch is automatically created for each partitioning column if the
|
||||
* source data has partitioning columns. This enables data skipping indexes to
|
||||
* be usable with filter conditions having a disjunction involving partitioning
|
||||
* columns and indexed columns. For example, a filter condition like "A = 1 or
|
||||
* B = 1" will be translated into an index predicate "(Min_A <= 1 and Max_A >=
|
||||
* 1) or B = 1" where A is an indexed column of MinMaxSketch and B is a
|
||||
* partitioning column.
|
||||
*/
|
||||
private[dataskipping] case class PartitionSketch(
|
||||
override val expressions: Seq[(String, Option[DataType])])
|
||||
extends Sketch {
|
||||
|
||||
override def indexedColumns: Seq[String] = exprStrings
|
||||
|
||||
override def referencedColumns: Seq[String] = exprStrings
|
||||
|
||||
override def withNewExpressions(
|
||||
newExpressions: Seq[(String, Option[DataType])]): PartitionSketch = {
|
||||
copy(expressions = newExpressions)
|
||||
}
|
||||
|
||||
override def aggregateFunctions: Seq[Expression] = {
|
||||
val parser = SparkSession.getActiveSession.get.sessionState.sqlParser
|
||||
exprStrings.map { e =>
|
||||
FirstNullSafe(parser.parseExpression(e)).toAggregateExpression()
|
||||
}
|
||||
}
|
||||
|
||||
override def toString: String = s"Partition(${exprStrings.mkString(", ")})"
|
||||
|
||||
override def convertPredicate(
|
||||
predicate: Expression,
|
||||
resolvedExprs: Seq[Expression],
|
||||
sketchValues: Seq[Expression],
|
||||
nameMap: Map[ExprId, String],
|
||||
valueExtractor: ExpressionExtractor): Option[Expression] = {
|
||||
predicate match {
|
||||
case And(_, _) | Or(_, _) => None
|
||||
case valueExtractor(v) if predicate.references.nonEmpty => Some(v)
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
||||
private def exprStrings: Seq[String] = expressions.map(_._1)
|
||||
}
|
|
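
Because each index row covers exactly one source file and a partitioning column is constant within a file, taking the first value (nulls not ignored) is enough to record it. A rough DataFrame-level equivalent of the aggregation, assuming a hypothetical source DataFrame sourceDf with partition column B and an indexed column A (column and alias names are illustrative):

import org.apache.spark.sql.functions.{first, input_file_name, max, min}

val indexDataSketch = sourceDf
  .groupBy(input_file_name().as("_data_file"))
  .agg(
    first("B", ignoreNulls = false).as("Partition_B"), // roughly what FirstNullSafe(B) amounts to
    min("A").as("Min_A"),
    max("A").as("Max_A"))
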
@@ -14,7 +14,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.sketch
|
||||
package com.microsoft.hyperspace.index.dataskipping.sketches
|
||||
|
||||
import scala.reflect.ClassTag
|
||||
|
|
@@ -14,12 +14,14 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.sketch
|
||||
package com.microsoft.hyperspace.index.dataskipping.sketches
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonTypeInfo
|
||||
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, ExprId}
|
||||
import org.apache.spark.sql.types.DataType
|
||||
|
||||
import com.microsoft.hyperspace.index.dataskipping.expressions.ExpressionExtractor
|
||||
|
||||
/**
|
||||
* Represents a sketch specification for data skipping indexes.
|
||||
*
|
||||
|
@@ -75,4 +77,42 @@ trait Sketch {
|
|||
* Returns the hash code for this sketch.
|
||||
*/
|
||||
def hashCode: Int
|
||||
|
||||
/**
|
||||
* Converts the given predicate node for source data to an index predicate
|
||||
* that can be used to filter out unnecessary source files when applied to
|
||||
* index data.
|
||||
*
|
||||
* The returned predicate should evaluate to true for an index data row
|
||||
* if the corresponding source data file cannot be excluded. The returned
|
||||
* predicate should be used only to find files that cannot be skipped. In
|
||||
* other words, it must not be negated to find files that can be skipped,
|
||||
* because it can evaluate to null.
|
||||
*
|
||||
* The implementation should consider the given predicate as a single node,
|
||||
* not a tree that must be traversed recursively, because that part is
|
||||
* handled by the framework.
|
||||
*
|
||||
* @param predicate Source predicate node
|
||||
* @param resolvedExprs Sketch expressions that can be used to match
|
||||
* expressions in the source predicate; for example,
|
||||
* MinMaxSketch("A") will be given an expression corresponding to
|
||||
* "A".
|
||||
* @param sketchValues Sketch value references in index data; for example,
|
||||
* MinMaxSketch("A") will be given two expressions corresponding to
|
||||
* Min(A) and Max(A) in the index data. If the predicate is
|
||||
* convertible, the implementation should return a predicate
|
||||
* composed of these sketch values.
|
||||
* @param nameMap Map used to normalize attributes in the source predicate by
|
||||
* looking up the attribute name with ExprId; this is needed because
|
||||
* the attribute name in the predicate may have different cases
|
||||
* (lower/upper cases).
|
||||
* @param valueExtractor Extractor for expressions in the source predicate,
|
||||
* such as literals, whose values can be used in the converted predicate
|
||||
* @return Converted predicate for index data
|
||||
*/
|
||||
def convertPredicate(
|
||||
predicate: Expression,
|
||||
resolvedExprs: Seq[Expression],
|
||||
sketchValues: Seq[Expression],
|
||||
nameMap: Map[ExprId, String],
|
||||
valueExtractor: ExpressionExtractor): Option[Expression]
|
||||
}
|
|
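
The "must not be negated" caveat above comes from SQL's three-valued logic: if a sketch value is NULL (for example, min/max over an all-null file), the converted predicate evaluates to NULL, and NOT(NULL) is NULL rather than TRUE, so a negated predicate could wrongly mark such a file as skippable. A quick way to see this in a Spark session:

// NOT(NULL) is NULL, not TRUE, so negating the converted predicate is unsafe.
spark.sql("SELECT NOT CAST(NULL AS BOOLEAN)").collect()
// returns a single row whose only value is null
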
@@ -1,199 +0,0 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.util
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions._
|
||||
import org.apache.spark.sql.types.{BooleanType, DataType}
|
||||
|
||||
// Value extractors returning Option[Literal] check if the value is not null
|
||||
// because we're only interested in non-null values.
|
||||
//
|
||||
// Also note that we don't go overboard to match every pattern because
|
||||
// we assume that Catalyst optimizer will give us an optimized predicate in NNF.
|
||||
// It means in general we don't have to deal with Not, or worry about
|
||||
// foldable expressions because they will be optimized to literals.
|
||||
//
|
||||
// There are some differences between Spark versions and that's why we include
|
||||
// some patterns that are no longer needed in newer Spark versions.
|
||||
|
||||
/**
|
||||
* Extracts the non-null literal value in the predicate if it's equivalent to
|
||||
* <expr> = <literal>.
|
||||
*
|
||||
* For the purpose of data skipping, we don't extract the value if it's null.
|
||||
* If the literal is not null, then the only way to make the predicate
|
||||
* <expr> = <literal> or <expr> <=> <literal> is when the expression is not null
|
||||
* and its value is equal to the literal value.
|
||||
*/
|
||||
case class EqualToExtractor(matcher: ExprMatcher) {
|
||||
def unapply(p: Expression): Option[Literal] =
|
||||
p match {
|
||||
case EqualTo(e, v: Literal) if v.value != null && matcher(e) => Some(v)
|
||||
case EqualTo(v: Literal, e) if v.value != null && matcher(e) => Some(v)
|
||||
case EqualNullSafe(e, v: Literal) if v.value != null && matcher(e) => Some(v)
|
||||
case EqualNullSafe(v: Literal, e) if v.value != null && matcher(e) => Some(v)
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts the non-null literal value in the predicate if it's equivalent to
|
||||
* <expr> < <literal>.
|
||||
*/
|
||||
case class LessThanExtractor(matcher: ExprMatcher) {
|
||||
def unapply(p: Expression): Option[Literal] =
|
||||
p match {
|
||||
case LessThan(e, v: Literal) if v.value != null && matcher(e) => Some(v)
|
||||
case GreaterThan(v: Literal, e) if v.value != null && matcher(e) => Some(v)
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts the non-null literal value in the predicate if it's equivalent to
|
||||
* <expr> <= <literal>.
|
||||
*/
|
||||
case class LessThanOrEqualToExtractor(matcher: ExprMatcher) {
|
||||
def unapply(p: Expression): Option[Literal] =
|
||||
p match {
|
||||
case LessThanOrEqual(e, v: Literal) if v.value != null && matcher(e) => Some(v)
|
||||
case GreaterThanOrEqual(v: Literal, e) if v.value != null && matcher(e) => Some(v)
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts the non-null literal value in the predicate if it's equivalent to
|
||||
* <expr> > <literal>.
|
||||
*/
|
||||
case class GreaterThanExtractor(matcher: ExprMatcher) {
|
||||
def unapply(p: Expression): Option[Literal] =
|
||||
p match {
|
||||
case GreaterThan(e, v: Literal) if v.value != null && matcher(e) => Some(v)
|
||||
case LessThan(v: Literal, e) if v.value != null && matcher(e) => Some(v)
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts the non-null literal value in the predicate if it's equivalent to
|
||||
* <expr> >= <literal>.
|
||||
*/
|
||||
case class GreaterThanOrEqualToExtractor(matcher: ExprMatcher) {
|
||||
def unapply(p: Expression): Option[Literal] =
|
||||
p match {
|
||||
case GreaterThanOrEqual(e, v: Literal) if v.value != null && matcher(e) => Some(v)
|
||||
case LessThanOrEqual(v: Literal, e) if v.value != null && matcher(e) => Some(v)
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Matches the predicate if it's equivalent to <expr> IS NULL.
|
||||
*/
|
||||
case class IsNullExtractor(matcher: ExprMatcher) {
|
||||
def unapply(p: Expression): Boolean =
|
||||
p match {
|
||||
case IsNull(e) if matcher(e) => true
|
||||
case EqualNullSafe(e, v: Literal) if v.value == null && matcher(e) => true
|
||||
case EqualNullSafe(v: Literal, e) if v.value == null && matcher(e) => true
|
||||
case _ => false
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Matches the predicate if it's equivalent to <expr> IS NOT NULL.
|
||||
*
|
||||
* Not(IsNull(<expr>)) is also matched because it can be in the predicate in
|
||||
* Spark 2.4. Since Spark 3.0, this is optimized to IsNotNull(<expr>).
|
||||
*/
|
||||
case class IsNotNullExtractor(matcher: ExprMatcher) {
|
||||
def unapply(p: Expression): Boolean =
|
||||
p match {
|
||||
case IsNotNull(e) if matcher(e) => true
|
||||
case Not(IsNull(e)) if matcher(e) => true // for Spark 2.4
|
||||
case Not(EqualNullSafe(e, v: Literal)) if v.value == null && matcher(e) => true
|
||||
case Not(EqualNullSafe(v: Literal, e)) if v.value == null && matcher(e) => true
|
||||
case _ => false
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Matches the predicate if it's equivalent to <expr> = true.
|
||||
*
|
||||
* Note that boolean expressions can be a predicate on their own, not needing
|
||||
* EqualTo with true. To avoid false matches, we check that the type of the
|
||||
* expression is BooleanType, although it's not strictly necessary because our
|
||||
* predicate conversion does not go down the predicate tree unless it's And/Or
|
||||
* and Spark has already checked the expression is Boolean if it's a direct
|
||||
* child of And/Or.
|
||||
*/
|
||||
case class IsTrueExtractor(matcher: ExprMatcher) {
|
||||
def unapply(p: Expression): Boolean = {
|
||||
val EqualTo = EqualToExtractor(matcher)
|
||||
p.dataType == BooleanType && (p match {
|
||||
case EqualTo(Literal(true, BooleanType)) => true
|
||||
case e if matcher(e) => true
|
||||
case _ => false
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Matches the predicate if it's equivalent to <expr> = false.
|
||||
*/
|
||||
case class IsFalseExtractor(matcher: ExprMatcher) {
|
||||
def unapply(p: Expression): Boolean = {
|
||||
val EqualTo = EqualToExtractor(matcher)
|
||||
p.dataType == BooleanType && (p match {
|
||||
case EqualTo(Literal(false, BooleanType)) => true
|
||||
case Not(e) if matcher(e) => true
|
||||
case _ => false
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts non-null values in the predicate if it's equivalent to
|
||||
* <expr> IN (<lit>*).
|
||||
*
|
||||
* In Spark, In is created if the number of values in the list does not exceed
|
||||
* spark.sql.optimizer.inSetConversionThreshold.
|
||||
*/
|
||||
case class InExtractor(matcher: ExprMatcher) {
|
||||
def unapply(p: Expression): Option[(Seq[Literal])] =
|
||||
p match {
|
||||
case In(e, vs) if vs.nonEmpty && vs.forall(v => v.isInstanceOf[Literal]) && matcher(e) =>
|
||||
Some(vs.map(_.asInstanceOf[Literal]).filter(_.value != null))
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts non-null values in the predicate if it's equivalent to
|
||||
* <expr> IN (<lit>*).
|
||||
*
|
||||
* InSet is created instead of In if the list size is larger than
|
||||
* spark.sql.optimizer.inSetConversionThreshold.
|
||||
*/
|
||||
case class InSetExtractor(matcher: ExprMatcher) {
|
||||
def unapply(p: Expression): Option[(Set[Any])] =
|
||||
p match {
|
||||
case InSet(e, vs) if matcher(e) => Some(vs)
|
||||
case _ => None
|
||||
}
|
||||
}
|
|
@@ -148,4 +148,11 @@ object FilterReasons {
|
|||
override def verboseStr: String =
|
||||
s"Another candidate index is applied: $appliedIndex"
|
||||
}
|
||||
|
||||
case class IneligibleFilterCondition(condition: String) extends FilterReason {
|
||||
override final def codeStr: String = "INELIGIBLE_FILTER_CONDITION"
|
||||
override val args = Seq("condition" -> condition)
|
||||
override def verboseStr: String =
|
||||
s"Ineligible filter condition: $condition"
|
||||
}
|
||||
}
|
||||
|
|
|
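
A small sketch of what the new reason carries once it is attached to an index log entry (the condition string is illustrative only):

val reason = FilterReasons.IneligibleFilterCondition("(A = 1) OR (RAND() > 0.5)")
reason.codeStr    // "INELIGIBLE_FILTER_CONDITION"
reason.args       // Seq("condition" -> "(A = 1) OR (RAND() > 0.5)")
reason.verboseStr // "Ineligible filter condition: (A = 1) OR (RAND() > 0.5)"
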
@@ -28,6 +28,7 @@ import org.apache.spark.sql.hyperspace.utils.logicalPlanToDataFrame
|
|||
|
||||
import com.microsoft.hyperspace.{HyperspaceException, Implicits}
|
||||
import com.microsoft.hyperspace.index.IndexConstants
|
||||
import com.microsoft.hyperspace.index.plans.logical.IndexHadoopFsRelation
|
||||
import com.microsoft.hyperspace.shim.ExtractFileSourceScanExecRelation
|
||||
|
||||
/**
|
||||
|
@@ -212,7 +213,13 @@ object PlanAnalyzer {
|
|||
plan: SparkPlan,
|
||||
indexes: DataFrame,
|
||||
bufferStream: BufferStream): Unit = {
|
||||
val usedIndexes = indexes.filter(indexes("indexLocation").isin(getPaths(plan): _*))
|
||||
val usedIndexNames = plan.collect {
|
||||
case ExtractFileSourceScanExecRelation(rel: IndexHadoopFsRelation) =>
|
||||
rel.indexName
|
||||
}
|
||||
val usedIndexes = indexes.filter(
|
||||
indexes("indexLocation").isin(getPaths(plan): _*) ||
|
||||
indexes("name").isin(usedIndexNames: _*))
|
||||
usedIndexes.collect().foreach { row =>
|
||||
bufferStream
|
||||
.write(row.getAs("name").toString)
|
||||
|
|
|
@@ -45,6 +45,24 @@ class IndexHadoopFsRelation(
|
|||
s"Hyperspace(Type: ${index.derivedDataset.kindAbbr}, " +
|
||||
s"Name: ${index.name}, LogVersion: ${index.id})"
|
||||
}
|
||||
|
||||
def indexName: String = index.name
|
||||
|
||||
override def toString(): String = indexPlanStr
|
||||
|
||||
}
|
||||
|
||||
object IndexHadoopFsRelation {
|
||||
def apply(
|
||||
rel: HadoopFsRelation,
|
||||
spark: SparkSession,
|
||||
index: IndexLogEntry): IndexHadoopFsRelation = {
|
||||
new IndexHadoopFsRelation(
|
||||
rel.location,
|
||||
rel.partitionSchema,
|
||||
rel.dataSchema,
|
||||
rel.bucketSpec,
|
||||
rel.fileFormat,
|
||||
rel.options)(spark, index)
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -63,4 +63,13 @@ object ApplyHyperspace
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
def withHyperspaceRuleDisabled[T](f: => T): T = {
|
||||
try {
|
||||
disableForIndexMaintenance.set(true)
|
||||
f
|
||||
} finally {
|
||||
disableForIndexMaintenance.set(false)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
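
A hedged usage sketch of the new helper: index-maintenance code paths wrap their reads so that ApplyHyperspace does not rewrite them against the very indexes being maintained (the data path below is hypothetical):

import com.microsoft.hyperspace.index.rules.ApplyHyperspace.withHyperspaceRuleDisabled

val sourceRowCount = withHyperspaceRuleDisabled {
  // Reads issued here bypass index application even if Hyperspace is enabled.
  spark.read.parquet("/data/source").count()
}
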
@@ -21,13 +21,15 @@ import scala.collection.mutable
|
|||
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
|
||||
|
||||
import com.microsoft.hyperspace.index.covering.{FilterIndexRule, JoinIndexRule}
|
||||
import com.microsoft.hyperspace.index.dataskipping.rules.ApplyDataSkippingIndex
|
||||
import com.microsoft.hyperspace.index.rules.ApplyHyperspace.PlanToIndexesMap
|
||||
|
||||
/**
|
||||
* Apply Hyperspace indexes based on the score of each index application.
|
||||
*/
|
||||
class ScoreBasedIndexPlanOptimizer {
|
||||
private val rules: Seq[HyperspaceRule] = FilterIndexRule :: JoinIndexRule :: NoOpRule :: Nil
|
||||
private val rules: Seq[HyperspaceRule] =
|
||||
Seq(FilterIndexRule, JoinIndexRule, ApplyDataSkippingIndex, NoOpRule)
|
||||
|
||||
// Map for memoization. The key is the logical plan before applying [[HyperspaceRule]]s
|
||||
// and its value is a pair of best transformed plan and its score.
|
||||
|
|
|
@@ -88,6 +88,13 @@ class DefaultFileBasedRelation(spark: SparkSession, override val plan: LogicalRe
|
|||
case _ => None
|
||||
}
|
||||
|
||||
override def getOrCreateFileIndex(spark: SparkSession): FileIndex = {
|
||||
plan.relation match {
|
||||
case HadoopFsRelation(location: FileIndex, _, _, _, _, _) =>
|
||||
location
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates [[HadoopFsRelation]] based on the current relation.
|
||||
*
|
||||
|
|
|
@@ -18,12 +18,12 @@ package com.microsoft.hyperspace.index.sources.iceberg
|
|||
|
||||
import collection.JavaConverters._
|
||||
import org.apache.hadoop.fs.{FileStatus, Path}
|
||||
import org.apache.iceberg.{FileScanTask, Schema, Table}
|
||||
import org.apache.iceberg.{FileScanTask, Schema, Table, TableScan}
|
||||
import org.apache.iceberg.spark.SparkSchemaUtil
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
|
||||
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
|
||||
import org.apache.spark.sql.execution.datasources.{FileIndex, HadoopFsRelation, LogicalRelation}
|
||||
import org.apache.spark.sql.execution.datasources._
|
||||
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
|
||||
import org.apache.spark.sql.types.StructType
|
||||
|
||||
|
@@ -70,7 +70,7 @@ class IcebergRelation(
|
|||
* All the files that the current Iceberg table uses for read.
|
||||
*/
|
||||
override lazy val allFiles: Seq[FileStatus] = {
|
||||
table.newScan().planFiles().iterator().asScala.toSeq.map(toFileStatus)
|
||||
fileScanTasks.map(toFileStatus)
|
||||
}
|
||||
|
||||
/**
|
||||
|
@@ -160,6 +160,33 @@ class IcebergRelation(
|
|||
SparkSchemaUtil.convert(schema)
|
||||
}
|
||||
|
||||
override def getOrCreateFileIndex(spark: SparkSession): InMemoryFileIndex = {
|
||||
val rootPaths = if (snapshotId.isEmpty) {
|
||||
if (table.spec().isUnpartitioned) {
|
||||
Seq(PathUtils.makeAbsolute(table.location(), spark.sessionState.newHadoopConf()))
|
||||
} else {
|
||||
// We'd like to use a single root path here, but then the partition discovery
|
||||
// fails because there is a directory called "metadata" under the root path.
|
||||
fileScanTasks.map(t => toPath(t).getParent).distinct
|
||||
}
|
||||
} else {
|
||||
// Listing all files with InMemoryFileIndex should be avoided,
|
||||
// but there is no other way as there is no FileIndex implementation
|
||||
// for Iceberg that supports snapshots.
|
||||
fileScanTasks.map(toPath)
|
||||
}
|
||||
new InMemoryFileIndex(
|
||||
spark,
|
||||
rootPaths,
|
||||
partitionBasePath
|
||||
.map(PartitioningAwareFileIndex.BASE_PATH_PARAM -> _)
|
||||
.toMap,
|
||||
Some(schema),
|
||||
FileStatusCache.getOrCreate(spark))
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates [[HadoopFsRelation]] based on the current relation.
|
||||
*
|
||||
|
@@ -191,10 +218,26 @@ class IcebergRelation(
|
|||
new LogicalRelation(hadoopFsRelation, updatedOutput, None, false)
|
||||
}
|
||||
|
||||
private def toFileStatus(fileScanTask: FileScanTask): FileStatus = {
|
||||
val path = PathUtils.makeAbsolute(
|
||||
private def newScan(): TableScan = {
|
||||
if (snapshotId.isDefined) {
|
||||
table.newScan().useSnapshot(snapshotId.get)
|
||||
} else {
|
||||
table.newScan()
|
||||
}
|
||||
}
|
||||
|
||||
private def fileScanTasks: Seq[FileScanTask] = {
|
||||
newScan().planFiles().iterator().asScala.toSeq
|
||||
}
|
||||
|
||||
private def toPath(fileScanTask: FileScanTask): Path = {
|
||||
PathUtils.makeAbsolute(
|
||||
new Path(fileScanTask.file().path().toString),
|
||||
spark.sessionState.newHadoopConf())
|
||||
}
|
||||
|
||||
private def toFileStatus(fileScanTask: FileScanTask): FileStatus = {
|
||||
val path = toPath(fileScanTask)
|
||||
val fs = path.getFileSystem(spark.sessionState.newHadoopConf())
|
||||
val fullPath = if (!path.isAbsolute) {
|
||||
new Path(s"${fs.getWorkingDirectory.toString}/${path.toString}")
|
||||
|
|
|
@@ -105,6 +105,11 @@ trait FileBasedRelation extends SourceRelation {
|
|||
*/
|
||||
def partitionBasePath: Option[String]
|
||||
|
||||
/**
|
||||
* Returns [[FileIndex]] for the current relation.
|
||||
*/
|
||||
def getOrCreateFileIndex(spark: SparkSession): FileIndex
|
||||
|
||||
/**
|
||||
* Creates [[HadoopFsRelation]] based on the current relation.
|
||||
*
|
||||
|
|
|
@@ -130,6 +130,49 @@ object HyperspaceConf {
|
|||
}
|
||||
longValue
|
||||
}
|
||||
|
||||
def maxIndexDataFileCount(spark: SparkSession): Int = {
|
||||
// TODO: Consider using a systematic way to validate the config value
|
||||
// like Spark's ConfigBuilder
|
||||
val value = spark.conf
|
||||
.get(
|
||||
IndexConstants.DATASKIPPING_MAX_INDEX_DATA_FILE_COUNT,
|
||||
IndexConstants.DATASKIPPING_MAX_INDEX_DATA_FILE_COUNT_DEFAULT)
|
||||
val intValue =
|
||||
try {
|
||||
value.toInt
|
||||
} catch {
|
||||
case e: NumberFormatException =>
|
||||
throw HyperspaceException(
|
||||
s"${IndexConstants.DATASKIPPING_MAX_INDEX_DATA_FILE_COUNT} " +
|
||||
s"should be int, but was $value")
|
||||
}
|
||||
if (intValue <= 0) {
|
||||
throw HyperspaceException(
|
||||
s"${IndexConstants.DATASKIPPING_MAX_INDEX_DATA_FILE_COUNT} " +
|
||||
s"should be a positive number.")
|
||||
}
|
||||
intValue
|
||||
}
|
||||
|
||||
def autoPartitionSketch(spark: SparkSession): Boolean = {
|
||||
// TODO: Consider using a systematic way to validate the config value
|
||||
// like Spark's ConfigBuilder
|
||||
val value = spark.conf
|
||||
.get(
|
||||
IndexConstants.DATASKIPPING_AUTO_PARTITION_SKETCH,
|
||||
IndexConstants.DATASKIPPING_AUTO_PARTITION_SKETCH_DEFAULT)
|
||||
val booleanValue =
|
||||
try {
|
||||
value.toBoolean
|
||||
} catch {
|
||||
case e: IllegalArgumentException =>
|
||||
throw HyperspaceException(
|
||||
s"${IndexConstants.DATASKIPPING_AUTO_PARTITION_SKETCH} " +
|
||||
s"should be boolean, but was $value")
|
||||
}
|
||||
booleanValue
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
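
Both getters validate user-supplied values before they are used; a hedged sketch of setting the corresponding configs on a session (the numeric value is arbitrary and for illustration only):

import com.microsoft.hyperspace.index.IndexConstants

spark.conf.set(IndexConstants.DATASKIPPING_MAX_INDEX_DATA_FILE_COUNT, "100")
spark.conf.set(IndexConstants.DATASKIPPING_AUTO_PARTITION_SKETCH, "false")
// Non-numeric, non-positive, or non-boolean values make the getters above throw HyperspaceException.
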
@@ -34,5 +34,6 @@ package object utils {
|
|||
implicit class StructTypeUtils(st: StructType) {
|
||||
// Expose package-private method
|
||||
def merge(that: StructType): StructType = st.merge(that)
|
||||
def sameType(that: StructType): Boolean = st.sameType(that)
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -0,0 +1,43 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
import org.apache.iceberg.{PartitionSpec => IcebergPartitionSpec, Table, TableProperties}
|
||||
import org.apache.iceberg.hadoop.HadoopTables
|
||||
import org.apache.iceberg.spark.SparkSchemaUtil
|
||||
import org.apache.spark.sql.DataFrame
|
||||
|
||||
trait IcebergTestUtils {
|
||||
def createIcebergTable(dataPath: String, sourceDf: DataFrame): Table = {
|
||||
val props = Map(TableProperties.WRITE_NEW_DATA_LOCATION -> dataPath).asJava
|
||||
val schema = SparkSchemaUtil.convert(sourceDf.schema)
|
||||
val part = IcebergPartitionSpec.builderFor(schema).build()
|
||||
new HadoopTables().create(schema, part, props, dataPath)
|
||||
}
|
||||
|
||||
def createIcebergTableWithPartitions(
|
||||
dataPath: String,
|
||||
sourceDf: DataFrame,
|
||||
partCol: String): Table = {
|
||||
val props = Map(TableProperties.WRITE_NEW_DATA_LOCATION -> dataPath).asJava
|
||||
val schema = SparkSchemaUtil.convert(sourceDf.schema)
|
||||
val part = IcebergPartitionSpec.builderFor(schema).identity(partCol).build()
|
||||
new HadoopTables().create(schema, part, props, dataPath)
|
||||
}
|
||||
}
|
|
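
A usage sketch of the shared helpers, mirroring how the integration tests below call them (the table locations are illustrative):

val data = spark.range(100).selectExpr("id as A", "cast(id / 10 as int) as B")
createIcebergTable("/tmp/iceberg/unpartitioned", data)
createIcebergTableWithPartitions("/tmp/iceberg/partitioned", data, "B")
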
@@ -30,7 +30,7 @@ import com.microsoft.hyperspace.index.IndexConstants.{GLOBBING_PATTERN_KEY, REFR
|
|||
import com.microsoft.hyperspace.index.IndexLogEntryTags._
|
||||
import com.microsoft.hyperspace.index.covering.JoinIndexRule
|
||||
import com.microsoft.hyperspace.index.dataskipping.DataSkippingIndexConfig
|
||||
import com.microsoft.hyperspace.index.dataskipping.sketch.MinMaxSketch
|
||||
import com.microsoft.hyperspace.index.dataskipping.sketches.MinMaxSketch
|
||||
import com.microsoft.hyperspace.index.execution.BucketUnionStrategy
|
||||
import com.microsoft.hyperspace.index.rules.{ApplyHyperspace, CandidateIndexCollector}
|
||||
import com.microsoft.hyperspace.util.PathUtils
|
||||
|
|
|
@@ -19,22 +19,19 @@ package com.microsoft.hyperspace.index
|
|||
import org.apache.commons.lang.StringUtils
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.iceberg.{PartitionSpec => IcebergPartitionSpec, Table, TableProperties}
|
||||
import org.apache.iceberg.hadoop.HadoopTables
|
||||
import org.apache.iceberg.spark.SparkSchemaUtil
|
||||
import org.apache.spark.sql.{DataFrame, QueryTest}
|
||||
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
|
||||
import org.apache.spark.sql.execution.datasources._
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
import com.microsoft.hyperspace.{Hyperspace, Implicits, SampleData, TestConfig}
|
||||
import com.microsoft.hyperspace.{Hyperspace, IcebergTestUtils, Implicits, SampleData, TestConfig}
|
||||
import com.microsoft.hyperspace.TestUtils.latestIndexLogEntry
|
||||
import com.microsoft.hyperspace.index.IndexConstants.REFRESH_MODE_QUICK
|
||||
import com.microsoft.hyperspace.index.plananalysis.{PlainTextMode, PlanAnalyzer}
|
||||
import com.microsoft.hyperspace.util.PathUtils
|
||||
import com.microsoft.hyperspace.util.PathUtils.DataPathFilter
|
||||
|
||||
class IcebergIntegrationTest extends QueryTest with HyperspaceSuite {
|
||||
class IcebergIntegrationTest extends QueryTest with HyperspaceSuite with IcebergTestUtils {
|
||||
override val indexLocationDirName = "icebergIntegrationTest"
|
||||
|
||||
private val sampleData = SampleData.testData
|
||||
|
@@ -131,7 +128,7 @@ class IcebergIntegrationTest extends QueryTest with HyperspaceSuite {
|
|||
val dfFromSample = testData
|
||||
.toDF("Date", "RGUID", "Query", "imprs", "clicks")
|
||||
|
||||
createIcebergTableWithDayPartition(testPath, dfFromSample)
|
||||
createIcebergTableWithPartitions(testPath, dfFromSample, "Date")
|
||||
|
||||
dfFromSample.write
|
||||
.format("iceberg")
|
||||
|
@@ -387,20 +384,6 @@ class IcebergIntegrationTest extends QueryTest with HyperspaceSuite {
|
|||
rootPaths.exists(_.toString.contains(p)))
|
||||
}
|
||||
|
||||
def createIcebergTable(dataPath: String, sourceDf: DataFrame): Table = {
|
||||
val props = Map(TableProperties.WRITE_NEW_DATA_LOCATION -> dataPath).asJava
|
||||
val schema = SparkSchemaUtil.convert(sourceDf.schema)
|
||||
val part = IcebergPartitionSpec.builderFor(schema).build()
|
||||
new HadoopTables().create(schema, part, props, dataPath)
|
||||
}
|
||||
|
||||
def createIcebergTableWithDayPartition(dataPath: String, sourceDf: DataFrame): Table = {
|
||||
val props = Map(TableProperties.WRITE_NEW_DATA_LOCATION -> dataPath).asJava
|
||||
val schema = SparkSchemaUtil.convert(sourceDf.schema)
|
||||
val part = IcebergPartitionSpec.builderFor(schema).identity("Date").build()
|
||||
new HadoopTables().create(schema, part, props, dataPath)
|
||||
}
|
||||
|
||||
private def truncate(s: String): String = {
|
||||
StringUtils.abbreviate(s, 100)
|
||||
}
|
||||
|
|
|
@@ -18,11 +18,11 @@ package com.microsoft.hyperspace.index.dataskipping
|
|||
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.spark.sql.functions.{input_file_name, max, min}
|
||||
import org.apache.spark.sql.types.{LongType, StringType}
|
||||
import org.apache.spark.sql.types.{IntegerType, LongType, StringType}
|
||||
|
||||
import com.microsoft.hyperspace.HyperspaceException
|
||||
import com.microsoft.hyperspace.index.IndexConstants
|
||||
import com.microsoft.hyperspace.index.dataskipping.sketch._
|
||||
import com.microsoft.hyperspace.index.dataskipping.sketches._
|
||||
|
||||
class DataSkippingIndexConfigTest extends DataSkippingSuite {
|
||||
test("indexName returns the index name.") {
|
||||
|
@@ -92,6 +92,46 @@ class DataSkippingIndexConfigTest extends DataSkippingSuite {
|
|||
assert(index.sketches === Seq(MinMaxSketch("Foo", Some(LongType))))
|
||||
}
|
||||
|
||||
test("createIndex creates partition sketches for partitioned source data.") {
|
||||
val sourceData =
|
||||
createPartitionedSourceData(spark.range(10).selectExpr("id as A", "id * 2 as B"), Seq("A"))
|
||||
val indexConfig = DataSkippingIndexConfig("MyIndex", MinMaxSketch("B"))
|
||||
val (index, indexData) = indexConfig.createIndex(ctx, sourceData, Map())
|
||||
assert(
|
||||
index.sketches === Seq(
|
||||
PartitionSketch(Seq(("A", Some(IntegerType)))),
|
||||
MinMaxSketch("B", Some(LongType))))
|
||||
}
|
||||
|
||||
test(
|
||||
"createIndex creates partition sketches for partitioned source data " +
|
||||
"with multiple partition columns.") {
|
||||
val sourceData =
|
||||
createPartitionedSourceData(
|
||||
spark.range(10).selectExpr("id as A", "id as B", "id * 2 as C"),
|
||||
Seq("A", "B"))
|
||||
val indexConfig = DataSkippingIndexConfig("MyIndex", MinMaxSketch("C"))
|
||||
val (index, indexData) = indexConfig.createIndex(ctx, sourceData, Map())
|
||||
assert(
|
||||
index.sketches === Seq(
|
||||
PartitionSketch(Seq(("A", Some(IntegerType)), ("B", Some(IntegerType)))),
|
||||
MinMaxSketch("C", Some(LongType))))
|
||||
}
|
||||
|
||||
test(
|
||||
"createIndex does not create partition sketches for partitioned source data " +
|
||||
"if the config is turned off.") {
|
||||
withSQLConf(IndexConstants.DATASKIPPING_AUTO_PARTITION_SKETCH -> "false") {
|
||||
val sourceData =
|
||||
createPartitionedSourceData(
|
||||
spark.range(10).selectExpr("id as A", "id * 2 as B"),
|
||||
Seq("A"))
|
||||
val indexConfig = DataSkippingIndexConfig("MyIndex", MinMaxSketch("B"))
|
||||
val (index, indexData) = indexConfig.createIndex(ctx, sourceData, Map())
|
||||
assert(index.sketches === Seq(MinMaxSketch("B", Some(LongType))))
|
||||
}
|
||||
}
|
||||
|
||||
test("createIndex throws an error if the data type is wrong.") {
|
||||
val sourceData = createSourceData(spark.range(10).toDF("Foo"))
|
||||
val indexConfig = DataSkippingIndexConfig("MyIndex", MinMaxSketch("foO", Some(StringType)))
|
||||
|
|
|
@@ -16,14 +16,45 @@
|
|||
|
||||
package com.microsoft.hyperspace.index.dataskipping
|
||||
|
||||
import com.microsoft.hyperspace.HyperspaceException
|
||||
import com.microsoft.hyperspace.index.dataskipping.sketch.MinMaxSketch
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.spark.SparkException
|
||||
import org.apache.spark.sql.{DataFrame, SaveMode}
|
||||
import org.apache.spark.sql.execution.datasources._
|
||||
import org.apache.spark.sql.types._
|
||||
|
||||
class DataSkippingIndexIntegrationTest extends DataSkippingSuite {
|
||||
import com.microsoft.hyperspace._
|
||||
import com.microsoft.hyperspace.index.IndexConstants
|
||||
import com.microsoft.hyperspace.index.covering.CoveringIndexConfig
|
||||
import com.microsoft.hyperspace.index.dataskipping.sketches._
|
||||
import com.microsoft.hyperspace.index.plans.logical.IndexHadoopFsRelation
|
||||
import com.microsoft.hyperspace.shim.ExtractFileSourceScanExecRelation
|
||||
|
||||
class DataSkippingIndexIntegrationTest extends DataSkippingSuite with IcebergTestUtils {
|
||||
import spark.implicits._
|
||||
|
||||
override val numParallelism: Int = 10
|
||||
|
||||
test("MinMax index is applied for a filter query (EqualTo).") {
|
||||
val df = createSourceData(spark.range(100).toDF("A"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
def query: DataFrame = df.filter("A = 1")
|
||||
checkIndexApplied(query, 1)
|
||||
}
|
||||
|
||||
test("Empty relation is returned if no files match the index predicate.") {
|
||||
val df = createSourceData(spark.range(100).toDF("A"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
def query: DataFrame = df.filter("A = -1")
|
||||
checkIndexApplied(query, 0)
|
||||
}
|
||||
|
||||
test("MinMax index is applied for a filter query (EqualTo) with expression.") {
|
||||
val df = createSourceData(spark.range(100).selectExpr("id as A", "id * 2 as B"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A + B")))
|
||||
def query: DataFrame = df.filter("A+B < 40")
|
||||
checkIndexApplied(query, 2)
|
||||
}
|
||||
|
||||
test("Non-deterministic expression is blocked.") {
|
||||
val df = createSourceData(spark.range(100).toDF("A"))
|
||||
val ex = intercept[HyperspaceException](
|
||||
|
@@ -88,4 +119,554 @@ class DataSkippingIndexIntegrationTest extends DataSkippingSuite {
|
|||
"DataSkippingIndex does not support indexing an expression which does not " +
|
||||
"reference source columns: myfunc()"))
|
||||
}
|
||||
|
||||
test("MinMax index is applied for a filter query (EqualTo) with UDF.") {
|
||||
val df = createSourceData(spark.range(100).toDF("A"))
|
||||
spark.udf.register("myfunc", (a: Int) => a * 2)
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("myfunc(A)")))
|
||||
def query: DataFrame = df.filter("myfunc(A) = 10")
|
||||
checkIndexApplied(query, 1)
|
||||
}
|
||||
|
||||
test("UDF matching is based on the name, not the actual lambda object.") {
|
||||
val df = createSourceData(spark.range(100).toDF("A"))
|
||||
spark.udf.register("myfunc", (a: Int) => a * 2)
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("myfunc(A)")))
|
||||
// Register a new function with the same semantics.
|
||||
spark.udf.register("myfunc", (a: Int) => 2 * a)
|
||||
def query: DataFrame = df.filter("myfunc(A) = 10")
|
||||
checkIndexApplied(query, 1)
|
||||
}
|
||||
|
||||
test("MinMax index is not applied for a filter query if it is not applicable.") {
|
||||
val df = createSourceData(spark.range(100).selectExpr("id as A", "id * 2 as B"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("B")))
|
||||
def query: DataFrame = df.filter("A = 1")
|
||||
checkIndexApplied(query, numParallelism)
|
||||
}
|
||||
|
||||
test("MinMax index is not applied for a filter query if the filter condition is unsuitable.") {
|
||||
val df = createSourceData(spark.range(100).selectExpr("id as A", "id * 2 as B"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
def query: DataFrame = df.filter("A = 1 or B = 2")
|
||||
checkIndexApplied(query, numParallelism)
|
||||
}
|
||||
|
||||
test("MinMax index is not applied for a filter query if the filter condition is IsNull.") {
|
||||
val df = createSourceData(spark.range(100).toDF("A"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
def query: DataFrame = df.filter("A is null")
|
||||
checkIndexApplied(query, numParallelism)
|
||||
}
|
||||
|
||||
test("Multiple indexes are applied to multiple filters.") {
|
||||
val df = createSourceData(spark.range(100).toDF("A"), path = "TA")
|
||||
val df2 = createSourceData(spark.range(100, 200).toDF("B"), path = "TB")
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
hs.createIndex(df2, DataSkippingIndexConfig("myind2", MinMaxSketch("B")))
|
||||
def query: DataFrame = df.filter("A = 10").union(df2.filter("B = 110"))
|
||||
checkIndexApplied(query, 2)
|
||||
}
|
||||
|
||||
test("Single index is applied to multiple filters.") {
|
||||
val df = createSourceData(spark.range(100).toDF("A"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
def query: DataFrame = df.filter("A = 10").union(df.filter("A = 20"))
|
||||
checkIndexApplied(query, 2)
|
||||
}
|
||||
|
||||
test("Single index is applied to a single filter.") {
|
||||
val df = createSourceData(spark.range(100).selectExpr("id as A", "id * 2 as B"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
def query: DataFrame = df.filter("A = 10").union(df.filter("B = 120"))
|
||||
checkIndexApplied(query, numParallelism + 1)
|
||||
}
|
||||
|
||||
test(
|
||||
"DataSkippingIndex works correctly for CSV where the same source data files can be " +
|
||||
"interpreted differently.") {
|
||||
// String order: 1 < 10 < 2
|
||||
// Int order: 1 < 2 < 10
|
||||
createFile(dataPath("1.csv"), Seq("a", "1", "2", "10").mkString("\n").getBytes())
|
||||
createFile(dataPath("2.csv"), Seq("a", "3", "4", "5").mkString("\n").getBytes())
|
||||
val paths = Seq(dataPath("1.csv").toString, dataPath("2.csv").toString)
|
||||
val dfString = spark.read.option("header", "true").csv(paths: _*)
|
||||
assert(dfString.schema.head.dataType === StringType)
|
||||
val dfInt = spark.read.option("header", "true").option("inferSchema", "true").csv(paths: _*)
|
||||
assert(dfInt.schema.head.dataType === IntegerType)
|
||||
|
||||
withIndex("myind") {
|
||||
hs.createIndex(dfString, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
checkIndexApplied(dfString.filter("A = 3"), 2)
|
||||
checkIndexApplied(dfString.filter("A = 10"), 2)
|
||||
checkIndexApplied(dfString.filter("A = '3'"), 1)
|
||||
checkIndexApplied(dfString.filter("A = '10'"), 1)
|
||||
checkIndexApplied(dfInt.filter("A = 3"), 2)
|
||||
checkIndexApplied(dfInt.filter("A = 10"), 2)
|
||||
checkIndexApplied(dfInt.filter("A = '3'"), 2)
|
||||
checkIndexApplied(dfInt.filter("A = '10'"), 2)
|
||||
}
|
||||
withIndex("myind") {
|
||||
hs.createIndex(dfInt, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
checkIndexApplied(dfString.filter("A = 3"), 2)
|
||||
checkIndexApplied(dfString.filter("A = 10"), 2)
|
||||
checkIndexApplied(dfString.filter("A = '3'"), 2)
|
||||
checkIndexApplied(dfString.filter("A = '10'"), 2)
|
||||
checkIndexApplied(dfInt.filter("A = 3"), 2)
|
||||
checkIndexApplied(dfInt.filter("A = 10"), 1)
|
||||
checkIndexApplied(dfInt.filter("A = '3'"), 2)
|
||||
checkIndexApplied(dfInt.filter("A = '10'"), 1)
|
||||
}
|
||||
}
|
||||
|
||||
test("MinMax index is applied for a filter query (EqualTo) with selection.") {
|
||||
val df = createSourceData(spark.range(100).selectExpr("id as A", "id * 2 as B"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
def query: DataFrame = df.filter("A = 1").select("B")
|
||||
checkIndexApplied(query, 1)
|
||||
}
|
||||
|
||||
test("MinMax index can be refreshed (mode = incremental).") {
|
||||
val df = createSourceData(spark.range(100).toDF("A"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
createSourceData(spark.range(100, 200).toDF("A"), saveMode = SaveMode.Append)
|
||||
hs.refreshIndex("myind", "incremental")
|
||||
def query: DataFrame = spark.read.parquet(dataPath().toString).filter("A = 1 OR A = 123")
|
||||
checkIndexApplied(query, 2)
|
||||
assert(numIndexDataFiles("myind") === 2)
|
||||
}
|
||||
|
||||
test("MinMax index can be refreshed (mode = full).") {
|
||||
val df = createSourceData(spark.range(100).toDF("A"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
createSourceData(spark.range(100, 200).toDF("A"), saveMode = SaveMode.Append)
|
||||
hs.refreshIndex("myind", "full")
|
||||
def query: DataFrame = spark.read.parquet(dataPath().toString).filter("A = 1 OR A = 123")
|
||||
checkIndexApplied(query, 2)
|
||||
assert(numIndexDataFiles("myind") === 1)
|
||||
}
|
||||
|
||||
test("MinMax index can be refreshed (mode = full) for partitioned data.") {
|
||||
val df = createPartitionedSourceData(
|
||||
spark.range(100).selectExpr("id as A", "cast(id / 10 as int) as B"),
|
||||
Seq("B"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
createPartitionedSourceData(
|
||||
spark.range(100, 200).selectExpr("id as A", "cast(id / 15 as int) as B"),
|
||||
Seq("B"),
|
||||
saveMode = SaveMode.Append)
|
||||
hs.refreshIndex("myind", "full")
|
||||
def query: DataFrame = spark.read.parquet(dataPath().toString).filter("A = 1 OR A = 123")
|
||||
checkIndexApplied(query, 2)
|
||||
assert(numIndexDataFiles("myind") === 1)
|
||||
}
|
||||
|
||||
test(
|
||||
"MinMax index can be applied without refresh when source files are added " +
|
||||
"if hybrid scan is enabled.") {
|
||||
withSQLConf(
|
||||
IndexConstants.INDEX_HYBRID_SCAN_ENABLED -> "true",
|
||||
IndexConstants.INDEX_HYBRID_SCAN_APPENDED_RATIO_THRESHOLD -> "1") {
|
||||
val df = createSourceData(spark.range(100).toDF("A"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
createSourceData(spark.range(100, 200).toDF("A"), saveMode = SaveMode.Append)
|
||||
def query: DataFrame = spark.read.parquet(dataPath().toString).filter("A = 1 OR A = 123")
|
||||
checkIndexApplied(query, 11)
|
||||
}
|
||||
}
|
||||
|
||||
test("Empty source data does not cause an error.") {
|
||||
val df = createSourceData(spark.range(0).toDF("A"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
def query: DataFrame = df.filter("A = 1")
|
||||
checkIndexApplied(query, 1)
|
||||
}
|
||||
|
||||
test("Empty source data followed by refresh incremental works as expected.") {
|
||||
val df = createSourceData(spark.range(0).toDF("A"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
createSourceData(spark.range(100).toDF("A"), saveMode = SaveMode.Append)
|
||||
hs.refreshIndex("myind", "incremental")
|
||||
def query: DataFrame = spark.read.parquet(dataPath().toString).filter("A = 1")
|
||||
checkIndexApplied(query, 2)
|
||||
}
|
||||
|
||||
test("MinMax index can be optimized.") {
|
||||
val df = createSourceData(spark.range(100).toDF("A"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
createSourceData(spark.range(100, 200).toDF("A"), saveMode = SaveMode.Append)
|
||||
hs.refreshIndex("myind", "incremental")
|
||||
assert(numIndexDataFiles("myind") === 2)
|
||||
hs.optimizeIndex("myind")
|
||||
assert(numIndexDataFiles("myind") === 1)
|
||||
def query: DataFrame = spark.read.parquet(dataPath().toString).filter("A = 1 OR A = 123")
|
||||
checkIndexApplied(query, 2)
|
||||
}
|
||||
|
||||
test("CoveringIndex is applied if both CoveringIndex and DataSkippingIndex are applicable.") {
|
||||
val df = createSourceData(spark.range(100).toDF("A"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("ds", MinMaxSketch("A")))
|
||||
hs.createIndex(df, CoveringIndexConfig("ci", Seq("A"), Nil))
|
||||
spark.enableHyperspace
|
||||
def query: DataFrame = df.filter("A = 1 or A = 50")
|
||||
val rel = query.queryExecution.optimizedPlan.collect {
|
||||
case LogicalRelation(rel: IndexHadoopFsRelation, _, _, _) => rel
|
||||
}
|
||||
assert(rel.map(_.indexName) === Seq("ci"))
|
||||
checkAnswer(query, Seq(1, 50).toDF("A"))
|
||||
}
|
||||
|
||||
test("DataSkippingIndex is applied if CoveringIndex is not applicable.") {
|
||||
val df = createSourceData(spark.range(100).selectExpr("id as A", "id * 2 as B"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("ds", MinMaxSketch("A")))
|
||||
hs.createIndex(df, CoveringIndexConfig("ci", Seq("A"), Nil))
|
||||
spark.enableHyperspace
|
||||
def query: DataFrame = df.filter("A = 1 or A = 50")
|
||||
val rel = query.queryExecution.optimizedPlan.collect {
|
||||
case LogicalRelation(rel: IndexHadoopFsRelation, _, _, _) => rel
|
||||
}
|
||||
assert(rel.map(_.indexName) === Seq("ds"))
|
||||
checkAnswer(query, Seq((1, 2), (50, 100)).toDF("A", "B"))
|
||||
}
|
||||
|
||||
test("Both CoveringIndex and DataSkippnigIndex can be applied.") {
|
||||
val df = createSourceData(spark.range(100).selectExpr("id as A", "id * 2 as B"))
|
||||
hs.createIndex(df, CoveringIndexConfig("ci", Seq("A"), Nil))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("ds", MinMaxSketch("B")))
|
||||
spark.enableHyperspace
|
||||
def query: DataFrame = df.filter("A = 1").select("A").union(df.filter("B = 100").select("A"))
|
||||
val rel = query.queryExecution.optimizedPlan.collect {
|
||||
case LogicalRelation(rel: IndexHadoopFsRelation, _, _, _) => rel
|
||||
}
|
||||
assert(rel.map(_.indexName).sorted === Seq("ci", "ds"))
|
||||
checkAnswer(query, Seq(1, 50).toDF("A"))
|
||||
}
|
||||
|
||||
test("DataSkippingIndex works correctly with files having special characters in their name.") {
|
||||
assume(!Path.WINDOWS)
|
||||
val df = createSourceData(spark.range(100).toDF("A"), "table ,.;'`~!@#$%^&()_+|\"<>")
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
def query: DataFrame = df.filter("A = 1")
|
||||
checkIndexApplied(query, 1)
|
||||
}
|
||||
|
||||
test("DataSkippingIndex works correctly with catalog tables") {
|
||||
withTable("T") {
|
||||
spark.range(100).toDF("A").write.saveAsTable("T")
|
||||
val df = spark.read.table("T")
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
def query: DataFrame = df.filter("A = 1")
|
||||
checkIndexApplied(query, 1)
|
||||
}
|
||||
}
|
||||
|
||||
test("DataSkippingIndex works correctly with partitioned data.") {
|
||||
val df = createPartitionedSourceData(
|
||||
spark.range(1000).selectExpr("cast(id/10 as int) as A", "id as B"),
|
||||
Seq("A"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("B")))
|
||||
def query: DataFrame = df.filter("A = 1 or B = 100")
|
||||
checkIndexApplied(query, 2)
|
||||
}
|
||||
|
||||
test(
|
||||
"DataSkippingIndex works correctly with partitioned data " +
|
||||
"with multiple partition columns.") {
|
||||
val df = createPartitionedSourceData(
|
||||
spark
|
||||
.range(1000)
|
||||
.selectExpr("cast(id/100 as int) as A", "cast(id/10 as int) as B", "id as C"),
|
||||
Seq("A", "B"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("C")))
|
||||
def query: DataFrame = df.filter("A = 1 or B = 1 or C = 1")
|
||||
checkIndexApplied(query, 12)
|
||||
}
|
||||
|
||||
test(
|
||||
"DataSkippingIndex works correctly with partitioned data " +
|
||||
"with a different filter condition.") {
|
||||
val df = createPartitionedSourceData(
|
||||
spark.range(1000).selectExpr("cast(id/200 as int)*200 as A", "id as B"),
|
||||
Seq("A"))
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("B")))
|
||||
def query: DataFrame = df.filter("A = B")
|
||||
checkIndexApplied(query, 5)
|
||||
}
|
||||
|
||||
test("DataSkippingIndex works correctly with Delta Lake tables.") {
|
||||
withSQLConf(
|
||||
"spark.hyperspace.index.sources.fileBasedBuilders" ->
|
||||
"com.microsoft.hyperspace.index.sources.delta.DeltaLakeFileBasedSourceBuilder") {
|
||||
val df = createSourceData(spark.range(100).toDF("A"), format = "delta")
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
def query: DataFrame = df.filter("A = 1")
|
||||
checkIndexApplied(query, 1)
|
||||
}
|
||||
}
|
||||
|
||||
test("DataSkippingIndex works correctly with partitioned Delta Lake tables.") {
|
||||
withSQLConf(
|
||||
"spark.hyperspace.index.sources.fileBasedBuilders" ->
|
||||
"com.microsoft.hyperspace.index.sources.delta.DeltaLakeFileBasedSourceBuilder") {
|
||||
val df = createPartitionedSourceData(
|
||||
spark.range(100).selectExpr("id as A", "cast(id / 10 as int) as B"),
|
||||
Seq("B"),
|
||||
format = "delta")
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
def query: DataFrame = df.filter("A = 1 or B = 5")
|
||||
checkIndexApplied(query, 2)
|
||||
}
|
||||
}
|
||||
|
||||
test("DataSkippingIndex works correctly with Delta time travel.") {
|
||||
withTable("T") {
|
||||
withSQLConf(
|
||||
"spark.hyperspace.index.sources.fileBasedBuilders" ->
|
||||
"com.microsoft.hyperspace.index.sources.delta.DeltaLakeFileBasedSourceBuilder",
|
||||
IndexConstants.INDEX_HYBRID_SCAN_ENABLED -> "true",
|
||||
IndexConstants.INDEX_HYBRID_SCAN_APPENDED_RATIO_THRESHOLD -> "10",
|
||||
IndexConstants.INDEX_HYBRID_SCAN_DELETED_RATIO_THRESHOLD -> "10") {
|
||||
|
||||
// version 0
|
||||
spark.range(100).toDF("A").write.format("delta").save(dataPath("T").toString)
|
||||
|
||||
// version 1
|
||||
spark
|
||||
.range(100, 200)
|
||||
.toDF("A")
|
||||
.write
|
||||
.format("delta")
|
||||
.mode("append")
|
||||
.save(dataPath("T").toString)
|
||||
|
||||
// version 2
|
||||
spark
|
||||
.range(200, 300)
|
||||
.toDF("A")
|
||||
.write
|
||||
.format("delta")
|
||||
.mode("append")
|
||||
.save(dataPath("T").toString)
|
||||
|
||||
val df = (v: Int) =>
|
||||
spark.read.format("delta").option("versionAsOf", v).load(dataPath("T").toString)
|
||||
|
||||
// Create an index with version 1 data
|
||||
hs.createIndex(df(1), DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
|
||||
def query0: DataFrame = df(0).filter("A = 1 or A = 101 or A = 201")
|
||||
checkIndexApplied(query0, 1)
|
||||
|
||||
def query1: DataFrame = df(1).filter("A = 1 or A = 101 or A = 201")
|
||||
checkIndexApplied(query1, 2)
|
||||
|
||||
def query2: DataFrame = df(2).filter("A = 1 or A = 101 or A = 201")
|
||||
checkIndexApplied(query2, 12)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("DataSkippingIndex works correctly with Delta time travel with partitions.") {
|
||||
withTable("T") {
|
||||
withSQLConf(
|
||||
"spark.hyperspace.index.sources.fileBasedBuilders" ->
|
||||
"com.microsoft.hyperspace.index.sources.delta.DeltaLakeFileBasedSourceBuilder",
|
||||
IndexConstants.INDEX_HYBRID_SCAN_ENABLED -> "true",
|
||||
IndexConstants.INDEX_HYBRID_SCAN_APPENDED_RATIO_THRESHOLD -> "10",
|
||||
IndexConstants.INDEX_HYBRID_SCAN_DELETED_RATIO_THRESHOLD -> "10") {
|
||||
|
||||
// version 0
|
||||
spark
|
||||
.range(100)
|
||||
.selectExpr("id as A", "cast(id / 10 as int) as B")
|
||||
.write
|
||||
.format("delta")
|
||||
.save(dataPath("T").toString)
|
||||
|
||||
// version 1
|
||||
spark
|
||||
.range(100, 200)
|
||||
.selectExpr("id as A", "cast(id / 15 as int) as B")
|
||||
.write
|
||||
.format("delta")
|
||||
.mode("append")
|
||||
.save(dataPath("T").toString)
|
||||
|
||||
// version 2
|
||||
spark
|
||||
.range(200, 300)
|
||||
.selectExpr("id as A", "cast(id / 20 as int) as B")
|
||||
.write
|
||||
.format("delta")
|
||||
.mode("append")
|
||||
.save(dataPath("T").toString)
|
||||
|
||||
val df = (v: Int) =>
|
||||
spark.read.format("delta").option("versionAsOf", v).load(dataPath("T").toString)
|
||||
|
||||
// Create an index with version 1 data
|
||||
hs.createIndex(df(1), DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
|
||||
def query0: DataFrame = df(0).filter("A = 1 or A = 101 or A = 201")
|
||||
checkIndexApplied(query0, 1)
|
||||
|
||||
def query1: DataFrame = df(1).filter("A = 1 or A = 101 or A = 201")
|
||||
checkIndexApplied(query1, 2)
|
||||
|
||||
def query2: DataFrame = df(2).filter("A = 1 or A = 101 or A = 201")
|
||||
checkIndexApplied(query2, 12)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("DataSkippingIndex works correctly with Iceberg tables.") {
|
||||
withSQLConf(
|
||||
"spark.hyperspace.index.sources.fileBasedBuilders" ->
|
||||
"com.microsoft.hyperspace.index.sources.iceberg.IcebergFileBasedSourceBuilder") {
|
||||
val data = spark.range(100).toDF("A")
|
||||
createIcebergTable(dataPath("T").toString, data)
|
||||
val df = createSourceData(data, format = "iceberg")
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
def query: DataFrame = df.filter("A = 1")
|
||||
checkIndexApplied(query, 1)
|
||||
}
|
||||
}
|
||||
|
||||
test("DataSkippingIndex works correctly with partitioned Iceberg tables.") {
|
||||
withSQLConf(
|
||||
"spark.hyperspace.index.sources.fileBasedBuilders" ->
|
||||
"com.microsoft.hyperspace.index.sources.iceberg.IcebergFileBasedSourceBuilder") {
|
||||
val data = spark.range(100).selectExpr("id as A", "cast(id / 10 as int) as B")
|
||||
createIcebergTableWithPartitions(dataPath("T").toString, data, "B")
|
||||
val df = createPartitionedSourceData(data, Seq("B"), format = "iceberg")
|
||||
hs.createIndex(df, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
def query: DataFrame = df.filter("A = 1")
|
||||
checkIndexApplied(query, 1)
|
||||
}
|
||||
}
|
||||
|
||||
test("DataSkippingIndex works correctly with Iceberg time travel.") {
|
||||
withSQLConf(
|
||||
"spark.hyperspace.index.sources.fileBasedBuilders" ->
|
||||
"com.microsoft.hyperspace.index.sources.iceberg.IcebergFileBasedSourceBuilder",
|
||||
IndexConstants.INDEX_HYBRID_SCAN_ENABLED -> "true",
|
||||
IndexConstants.INDEX_HYBRID_SCAN_APPENDED_RATIO_THRESHOLD -> "10",
|
||||
IndexConstants.INDEX_HYBRID_SCAN_DELETED_RATIO_THRESHOLD -> "10") {
|
||||
|
||||
// version 0
|
||||
val data = spark.range(100).toDF("A")
|
||||
val table = createIcebergTable(dataPath("T").toString, data)
|
||||
data.write.format("iceberg").mode("overwrite").save(dataPath("T").toString)
|
||||
val v0 = table.currentSnapshot.snapshotId
|
||||
|
||||
// version 1
|
||||
spark
|
||||
.range(100, 200)
|
||||
.toDF("A")
|
||||
.write
|
||||
.format("iceberg")
|
||||
.mode("append")
|
||||
.save(dataPath("T").toString)
|
||||
table.newTransaction().commitTransaction()
|
||||
val v1 = table.currentSnapshot.snapshotId
|
||||
|
||||
// version 2
|
||||
spark
|
||||
.range(200, 300)
|
||||
.toDF("A")
|
||||
.write
|
||||
.format("iceberg")
|
||||
.mode("append")
|
||||
.save(dataPath("T").toString)
|
||||
table.newTransaction().commitTransaction()
|
||||
val v2 = table.currentSnapshot.snapshotId
|
||||
|
||||
val df = (v: Long) =>
|
||||
spark.read.format("iceberg").option("snapshot-id", v).load(dataPath("T").toString)
|
||||
|
||||
// Create an index with version 1 data
|
||||
hs.createIndex(df(v1), DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
|
||||
def query0: DataFrame = df(v0).filter("A = 1 or A = 101 or A = 201")
|
||||
checkIndexApplied(query0, 1)
|
||||
|
||||
def query1: DataFrame = df(v1).filter("A = 1 or A = 101 or A = 201")
|
||||
checkIndexApplied(query1, 2)
|
||||
|
||||
def query2: DataFrame = df(v2).filter("A = 1 or A = 101 or A = 201")
|
||||
checkIndexApplied(query2, 12)
|
||||
}
|
||||
}
|
||||
|
||||
test("DataSkippingIndex works correctly with Iceberg time travel with partitions.") {
|
||||
withSQLConf(
|
||||
"spark.hyperspace.index.sources.fileBasedBuilders" ->
|
||||
"com.microsoft.hyperspace.index.sources.iceberg.IcebergFileBasedSourceBuilder",
|
||||
IndexConstants.INDEX_HYBRID_SCAN_ENABLED -> "true",
|
||||
IndexConstants.INDEX_HYBRID_SCAN_APPENDED_RATIO_THRESHOLD -> "10",
|
||||
IndexConstants.INDEX_HYBRID_SCAN_DELETED_RATIO_THRESHOLD -> "10") {
|
||||
|
||||
// version 0
|
||||
val data = spark.range(100).selectExpr("id as A", "cast(id / 10 as int) as B")
|
||||
val table = createIcebergTable(dataPath("T").toString, data)
|
||||
data.write.format("iceberg").mode("overwrite").save(dataPath("T").toString)
|
||||
val v0 = table.currentSnapshot.snapshotId
|
||||
|
||||
// version 1
|
||||
spark
|
||||
.range(100, 200)
|
||||
.selectExpr("id as A", "cast(id / 15 as int) as B")
|
||||
.write
|
||||
.format("iceberg")
|
||||
.mode("append")
|
||||
.save(dataPath("T").toString)
|
||||
table.newTransaction().commitTransaction()
|
||||
val v1 = table.currentSnapshot.snapshotId
|
||||
|
||||
// version 2
|
||||
spark
|
||||
.range(200, 300)
|
||||
.selectExpr("id as A", "cast(id / 20 as int) as B")
|
||||
.write
|
||||
.format("iceberg")
|
||||
.mode("append")
|
||||
.save(dataPath("T").toString)
|
||||
table.newTransaction().commitTransaction()
|
||||
val v2 = table.currentSnapshot.snapshotId
|
||||
|
||||
val df = (v: Long) =>
|
||||
spark.read.format("iceberg").option("snapshot-id", v).load(dataPath("T").toString)
|
||||
|
||||
// Create an index with version 1 data
|
||||
hs.createIndex(df(v1), DataSkippingIndexConfig("myind", MinMaxSketch("A")))
|
||||
|
||||
def query0: DataFrame = df(v0).filter("A = 1 or A = 101 or A = 201")
|
||||
checkIndexApplied(query0, 1)
|
||||
|
||||
def query1: DataFrame = df(v1).filter("A = 1 or A = 101 or A = 201")
|
||||
checkIndexApplied(query1, 2)
|
||||
|
||||
def query2: DataFrame = df(v2).filter("A = 1 or A = 101 or A = 201")
|
||||
checkIndexApplied(query2, 12)
|
||||
}
|
||||
}
|
||||
|
||||
def checkIndexApplied(query: => DataFrame, numExpectedFiles: Int): Unit = {
|
||||
withClue(s"query = ${query.queryExecution.logical}numExpectedFiles = $numExpectedFiles\n") {
|
||||
spark.disableHyperspace
|
||||
val queryWithoutIndex = query
|
||||
queryWithoutIndex.collect()
|
||||
spark.enableHyperspace
|
||||
val queryWithIndex = query
|
||||
queryWithIndex.collect()
|
||||
checkAnswer(queryWithIndex, queryWithoutIndex)
|
||||
assert(numAccessedFiles(queryWithIndex) === numExpectedFiles)
|
||||
}
|
||||
}
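checkIndexApplied above evaluates the same query twice, once with Hyperspace disabled and once enabled, checks that both produce identical rows, and then asserts how many data files the index-assisted plan actually reads. A minimal usage sketch follows; the expected count of 1 is an assumption based on the layout used by the tests above, where 100 sequential ids end up in separate small files so a MinMax sketch can prune an equality filter down to one file.

// Hypothetical usage sketch: build a MinMax-sketched index and assert that an
// equality filter only touches the single file that can contain the value.
val sourceData = createSourceData(spark.range(100).toDF("A"))
hs.createIndex(sourceData, DataSkippingIndexConfig("myind", MinMaxSketch("A")))
checkIndexApplied(sourceData.filter("A = 42"), 1)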
|
||||
|
||||
def numIndexDataFiles(name: String): Int = {
|
||||
val manager = Hyperspace.getContext(spark).indexCollectionManager
|
||||
val latestVersion = manager.getIndexVersions(name, Seq("ACTIVE")).max
|
||||
manager.getIndex(name, latestVersion).get.content.files.length
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -16,56 +16,60 @@
|
|||
|
||||
package com.microsoft.hyperspace.index.dataskipping
|
||||
|
||||
import scala.util.parsing.json.JSON
|
||||
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.spark.sql.SaveMode
|
||||
import org.apache.spark.sql.functions.{input_file_name, max, min}
|
||||
import org.apache.spark.sql.types.IntegerType
|
||||
import org.apache.spark.sql.types.{IntegerType, StructType}
|
||||
|
||||
import com.microsoft.hyperspace.HyperspaceException
|
||||
import com.microsoft.hyperspace.index.{Content, FileInfo, Index, IndexConstants}
|
||||
import com.microsoft.hyperspace.index.dataskipping.sketch.MinMaxSketch
|
||||
import com.microsoft.hyperspace.index.dataskipping.sketches.MinMaxSketch
|
||||
import com.microsoft.hyperspace.util.JsonUtils
|
||||
|
||||
class DataSkippingIndexTest extends DataSkippingSuite {
|
||||
override val numParallelism: Int = 3
|
||||
override val numParallelism: Int = 10
|
||||
|
||||
val emptyStructType = new StructType()
|
||||
|
||||
test("""kind returns "DataSkippingIndex".""") {
|
||||
val index = DataSkippingIndex(Seq(MinMaxSketch("A")))
|
||||
val index = DataSkippingIndex(Seq(MinMaxSketch("A")), emptyStructType)
|
||||
assert(index.kind === "DataSkippingIndex")
|
||||
}
|
||||
|
||||
test("""kindAbbr returns "DS".""") {
|
||||
val index = DataSkippingIndex(Seq(MinMaxSketch("A")))
|
||||
val index = DataSkippingIndex(Seq(MinMaxSketch("A")), emptyStructType)
|
||||
assert(index.kindAbbr === "DS")
|
||||
}
|
||||
|
||||
test("indexedColumns returns indexed columns of sketches.") {
|
||||
val index = DataSkippingIndex(Seq(MinMaxSketch("A"), MinMaxSketch("B")))
|
||||
val index = DataSkippingIndex(Seq(MinMaxSketch("A"), MinMaxSketch("B")), emptyStructType)
|
||||
assert(index.indexedColumns === Seq("A", "B"))
|
||||
}
|
||||
|
||||
test("referencedColumns returns indexed columns of sketches.") {
|
||||
val index = DataSkippingIndex(Seq(MinMaxSketch("A"), MinMaxSketch("B")))
|
||||
val index = DataSkippingIndex(Seq(MinMaxSketch("A"), MinMaxSketch("B")), emptyStructType)
|
||||
assert(index.referencedColumns === Seq("A", "B"))
|
||||
}
|
||||
|
||||
test(
|
||||
"withNewProperties returns a new index which copies the original index except the " +
|
||||
"properties.") {
|
||||
val index = DataSkippingIndex(Seq(MinMaxSketch("A")))
|
||||
val index = DataSkippingIndex(Seq(MinMaxSketch("A")), emptyStructType)
|
||||
val newIndex = index.withNewProperties(Map("foo" -> "bar"))
|
||||
assert(newIndex.properties === Map("foo" -> "bar"))
|
||||
assert(newIndex.sketches === index.sketches)
|
||||
}
|
||||
|
||||
test("statistics returns a string-formatted list of sketches.") {
|
||||
val index = DataSkippingIndex(Seq(MinMaxSketch("A"), MinMaxSketch("B")))
|
||||
val index = DataSkippingIndex(Seq(MinMaxSketch("A"), MinMaxSketch("B")), emptyStructType)
|
||||
assert(index.statistics() === Map("sketches" -> "MinMax(A), MinMax(B)"))
|
||||
}
|
||||
|
||||
test("canHandleDeletedFiles returns true.") {
|
||||
val index = DataSkippingIndex(Seq(MinMaxSketch("A")))
|
||||
val index = DataSkippingIndex(Seq(MinMaxSketch("A")), emptyStructType)
|
||||
assert(index.canHandleDeletedFiles === true)
|
||||
}
|
||||
|
||||
|
@@ -78,6 +82,14 @@ class DataSkippingIndexTest extends DataSkippingSuite {
|
|||
checkAnswer(writtenIndexData, indexData)
|
||||
}
|
||||
|
||||
test("write throws an exception if the index data schema doesn't match.") {
|
||||
val sourceData = createSourceData(spark.range(100).toDF("A"))
|
||||
val indexConfig = DataSkippingIndexConfig("myIndex", MinMaxSketch("A"))
|
||||
val (index, _) = indexConfig.createIndex(ctx, sourceData, Map())
|
||||
val ex = intercept[IllegalArgumentException](index.write(ctx, spark.range(10).toDF("B")))
|
||||
assert(ex.getMessage.contains("Schema of the index data doesn't match the index schema"))
|
||||
}
|
||||
|
||||
test("optimize reduces the number of index data files.") {
|
||||
val targetIndexDataFileSize = 100000
|
||||
val expectedNumIndexDataFiles = 1
|
||||
|
@@ -112,16 +124,15 @@ class DataSkippingIndexTest extends DataSkippingSuite {
|
|||
}
|
||||
}
|
||||
|
||||
test("write throws an exception if target index data file size is too small.") {
|
||||
withSQLConf(IndexConstants.DATASKIPPING_TARGET_INDEX_DATA_FILE_SIZE -> "1") {
|
||||
test("write does not create more files than maxIndexDataFileCount.") {
|
||||
withSQLConf(
|
||||
IndexConstants.DATASKIPPING_TARGET_INDEX_DATA_FILE_SIZE -> "1",
|
||||
IndexConstants.DATASKIPPING_MAX_INDEX_DATA_FILE_COUNT -> "3") {
|
||||
val indexConfig = DataSkippingIndexConfig("myIndex", MinMaxSketch("A"))
|
||||
val sourceData = createSourceData(spark.range(100).toDF("A"))
|
||||
val sourceData = createSourceData(spark.range(10000).toDF("A"))
|
||||
val (index, indexData) = indexConfig.createIndex(ctx, sourceData, Map())
|
||||
val mockIndexData = RDDTestUtils.getMockDataFrameWithFakeSize(spark, 4000000000L)
|
||||
val ex = intercept[HyperspaceException](index.write(ctx, mockIndexData))
|
||||
assert(
|
||||
ex.getMessage.contains("Could not create index data files due to too many files: " +
|
||||
"indexDataSize=4000000000, targetIndexDataFileSize=1"))
|
||||
index.write(ctx, indexData)
|
||||
assert(listFiles(indexDataPath).filter(isParquet).length === 3)
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -231,38 +242,45 @@ class DataSkippingIndexTest extends DataSkippingSuite {
|
|||
}
|
||||
|
||||
test("At least one sketch must be specified.") {
|
||||
val ex = intercept[AssertionError](DataSkippingIndex(Nil))
|
||||
val ex = intercept[AssertionError](DataSkippingIndex(Nil, emptyStructType))
|
||||
assert(ex.getMessage().contains("At least one sketch is required"))
|
||||
}
|
||||
|
||||
test("Indexes are equal if they have the same sketches and data types.") {
|
||||
val ds1 = DataSkippingIndex(Seq(MinMaxSketch("A"), MinMaxSketch("B")))
|
||||
val ds2 = DataSkippingIndex(Seq(MinMaxSketch("B"), MinMaxSketch("A")))
|
||||
val ds1 = DataSkippingIndex(Seq(MinMaxSketch("A"), MinMaxSketch("B")), emptyStructType)
|
||||
val ds2 = DataSkippingIndex(Seq(MinMaxSketch("B"), MinMaxSketch("A")), emptyStructType)
|
||||
assert(ds1 === ds2)
|
||||
assert(ds1.hashCode === ds2.hashCode)
|
||||
}
|
||||
|
||||
test("Indexes are not equal to objects which are not indexes.") {
|
||||
val ds = DataSkippingIndex(Seq(MinMaxSketch("A")))
|
||||
val ds = DataSkippingIndex(Seq(MinMaxSketch("A")), emptyStructType)
|
||||
assert(ds !== "ds")
|
||||
}
|
||||
|
||||
test("Index can be serialized.") {
|
||||
val ds = DataSkippingIndex(Seq(MinMaxSketch("A", Some(IntegerType))), Map("a" -> "b"))
|
||||
val ds = DataSkippingIndex(
|
||||
Seq(MinMaxSketch("A", Some(IntegerType))),
|
||||
emptyStructType,
|
||||
Map("a" -> "b"))
|
||||
val json = JsonUtils.toJson(ds)
|
||||
assert(
|
||||
json ===
|
||||
val expectedJson =
|
||||
"""|{
|
||||
| "type" : "com.microsoft.hyperspace.index.dataskipping.DataSkippingIndex",
|
||||
| "sketches" : [ {
|
||||
| "type" : "com.microsoft.hyperspace.index.dataskipping.sketch.MinMaxSketch",
|
||||
| "type" : "com.microsoft.hyperspace.index.dataskipping.sketches.MinMaxSketch",
|
||||
| "expr" : "A",
|
||||
| "dataType" : "integer"
|
||||
| } ],
|
||||
| "schema" : {
|
||||
| "type" : "struct",
|
||||
| "fields" : [ ]
|
||||
| },
|
||||
| "properties" : {
|
||||
| "a" : "b"
|
||||
| }
|
||||
|}""".stripMargin)
|
||||
|}""".stripMargin
|
||||
assert(JSON.parseFull(json) === JSON.parseFull(expectedJson))
|
||||
}
|
||||
|
||||
test("Index can be deserialized.") {
|
||||
|
@@ -270,16 +288,20 @@ class DataSkippingIndexTest extends DataSkippingSuite {
|
|||
"""|{
|
||||
| "type" : "com.microsoft.hyperspace.index.dataskipping.DataSkippingIndex",
|
||||
| "sketches" : [ {
|
||||
| "type" : "com.microsoft.hyperspace.index.dataskipping.sketch.MinMaxSketch",
|
||||
| "type" : "com.microsoft.hyperspace.index.dataskipping.sketches.MinMaxSketch",
|
||||
| "expr" : "A",
|
||||
| "dataType" : "integer"
|
||||
| } ],
|
||||
| "schema" : {
|
||||
| "type" : "struct",
|
||||
| "fields" : [ ]
|
||||
| },
|
||||
| "properties" : {
|
||||
| "a" : "b"
|
||||
| }
|
||||
|}""".stripMargin
|
||||
val ds = JsonUtils.fromJson[DataSkippingIndex](json)
|
||||
assert(ds === DataSkippingIndex(Seq(MinMaxSketch("A", Some(IntegerType)))))
|
||||
assert(ds === DataSkippingIndex(Seq(MinMaxSketch("A", Some(IntegerType))), emptyStructType))
|
||||
assert(ds.properties === Map("a" -> "b"))
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -21,6 +21,9 @@ import scala.collection.AbstractIterator
|
|||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.{FileStatus, Path, RemoteIterator}
|
||||
import org.apache.spark.sql.{DataFrame, QueryTest, SaveMode, SparkSession}
|
||||
import org.apache.spark.sql.execution.DataSourceScanExec
|
||||
import org.apache.spark.sql.execution.datasources.FilePartition
|
||||
import org.apache.spark.sql.internal.SQLConf
|
||||
|
||||
import com.microsoft.hyperspace.Hyperspace
|
||||
import com.microsoft.hyperspace.index._
|
||||
|
@@ -52,6 +55,7 @@ trait DataSkippingSuite extends QueryTest with HyperspaceSuite {
|
|||
|
||||
after {
|
||||
FileUtils.delete(tempDir)
|
||||
spark.catalog.clearCache()
|
||||
}
|
||||
|
||||
def dataPath(path: String = "T"): Path = new Path(dataPathRoot, path)
|
||||
|
@@ -60,16 +64,43 @@ trait DataSkippingSuite extends QueryTest with HyperspaceSuite {
|
|||
originalData: DataFrame,
|
||||
path: String = "T",
|
||||
saveMode: SaveMode = SaveMode.Overwrite,
|
||||
appendedDataOnly: Boolean = false): DataFrame = {
|
||||
appendedDataOnly: Boolean = false,
|
||||
format: String = "parquet"): DataFrame = {
|
||||
val p = dataPath(path)
|
||||
val oldFiles = listFiles(p).toSet
|
||||
originalData.write.mode(saveMode).parquet(p.toString)
|
||||
originalData.write.mode(saveMode).format(format).save(p.toString)
|
||||
updateFileIdTracker(p)
|
||||
if (appendedDataOnly) {
|
||||
val newFiles = listFiles(p).filterNot(oldFiles.contains)
|
||||
spark.read.parquet(newFiles.map(_.getPath.toString): _*)
|
||||
spark.read.format(format).load(newFiles.map(_.getPath.toString): _*)
|
||||
} else {
|
||||
spark.read.parquet(p.toString)
|
||||
spark.read.format(format).load(p.toString)
|
||||
}
|
||||
}
|
||||
|
||||
def createPartitionedSourceData(
|
||||
originalData: DataFrame,
|
||||
partitioningColumns: Seq[String],
|
||||
path: String = "T",
|
||||
saveMode: SaveMode = SaveMode.Overwrite,
|
||||
appendedDataOnly: Boolean = false,
|
||||
format: String = "parquet"): DataFrame = {
|
||||
val p = dataPath(path)
|
||||
val oldFiles = listFiles(p).toSet
|
||||
originalData.write
|
||||
.partitionBy(partitioningColumns: _*)
|
||||
.mode(saveMode)
|
||||
.format(format)
|
||||
.save(p.toString)
|
||||
updateFileIdTracker(p)
|
||||
if (appendedDataOnly) {
|
||||
val newFiles = listFiles(p).filterNot(oldFiles.contains)
|
||||
spark.read
|
||||
.option("basePath", p.toString)
|
||||
.format(format)
|
||||
.load(newFiles.map(_.getPath.toString): _*)
|
||||
} else {
|
||||
spark.read.format(format).load(p.toString)
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -110,5 +141,55 @@ trait DataSkippingSuite extends QueryTest with HyperspaceSuite {
|
|||
fs.delete(path, true)
|
||||
}
|
||||
|
||||
def isParquet: FileStatus => Boolean = _.getPath.getName.endsWith(".parquet")
|
||||
def createFile(path: Path, data: Array[Byte]): Unit = {
|
||||
val fs = path.getFileSystem(new Configuration)
|
||||
val out = fs.create(path)
|
||||
out.write(data)
|
||||
out.close()
|
||||
}
|
||||
|
||||
def isParquet: FileStatus => Boolean = _.getPath.getName.endsWith(".parquet")
|
||||
|
||||
def withAndWithoutCodegen(testFun: => Unit): Unit = {
|
||||
import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode._
|
||||
Seq(false, true).foreach { codegenEnabled =>
|
||||
withClue(s"codegenEnabled = $codegenEnabled") {
|
||||
val mode = if (codegenEnabled) CODEGEN_ONLY else NO_CODEGEN
|
||||
withSQLConf(
|
||||
SQLConf.CODEGEN_FACTORY_MODE.key -> mode.toString,
|
||||
SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> codegenEnabled.toString) {
|
||||
testFun
|
||||
}
|
||||
}
|
||||
}
|
||||
}
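withAndWithoutCodegen above runs the supplied test body twice, first with the codegen factory forced to NO_CODEGEN and whole-stage codegen off, then with CODEGEN_ONLY and codegen on, so expressions are exercised along both the interpreted and the generated code paths. A minimal usage sketch, with an illustrative query that is an assumption rather than something taken from the suite:

// Hypothetical usage sketch: the body runs once per codegen mode.
withAndWithoutCodegen {
  val doubled = spark.range(10).selectExpr("id * 2 as d").collect().map(_.getLong(0))
  assert(doubled.toSeq === (0 until 10).map(_ * 2L))
}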
|
||||
|
||||
def createIndexLogEntry(indexConfig: IndexConfigTrait, sourceData: DataFrame): IndexLogEntry = {
|
||||
val (index, indexData) = indexConfig.createIndex(ctx, sourceData, Map())
|
||||
index.write(ctx, indexData)
|
||||
IndexLogEntry(
|
||||
indexConfig.indexName,
|
||||
index,
|
||||
Content.fromDirectory(indexDataPath, fileIdTracker, new Configuration),
|
||||
Source(
|
||||
SparkPlan(SparkPlan.Properties(
|
||||
Seq(RelationUtils
|
||||
.getRelation(spark, sourceData.queryExecution.optimizedPlan)
|
||||
.createRelationMetadata(fileIdTracker)),
|
||||
null,
|
||||
null,
|
||||
LogicalPlanFingerprint(
|
||||
LogicalPlanFingerprint.Properties(Seq(Signature("sp", "sig"))))))),
|
||||
Map.empty)
|
||||
}
|
||||
|
||||
def numAccessedFiles(df: DataFrame): Int = {
|
||||
df.queryExecution.executedPlan.collect {
|
||||
case scan: DataSourceScanExec =>
|
||||
val files = scan.inputRDDs
|
||||
.flatMap(
|
||||
_.partitions.flatMap(_.asInstanceOf[FilePartition].files.map(_.filePath).toSet))
|
||||
files.length
|
||||
}.sum
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -1,38 +0,0 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping
|
||||
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.sql.{DataFrame, SparkSession}
|
||||
import org.apache.spark.storage.RDDInfo
|
||||
import org.mockito.Mockito.{mock, spy, when}
|
||||
|
||||
object RDDTestUtils {
|
||||
def getMockDataFrameWithFakeSize(spark: SparkSession, size: Long): DataFrame = {
|
||||
val df = spy(spark.emptyDataFrame)
|
||||
val rdd = spy(df.rdd)
|
||||
val mockSparkContext = mock(classOf[SparkContext])
|
||||
val mockRddStorageInfo = mock(classOf[RDDInfo])
|
||||
when(df.rdd).thenReturn(rdd)
|
||||
when(rdd.id).thenReturn(42)
|
||||
when(rdd.context).thenReturn(mockSparkContext)
|
||||
when(mockSparkContext.getRDDStorageInfo).thenReturn(Array[RDDInfo](mockRddStorageInfo))
|
||||
when(mockRddStorageInfo.id).thenReturn(42)
|
||||
when(mockRddStorageInfo.memSize).thenReturn(size)
|
||||
df
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,91 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.execution
|
||||
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.spark.sql.DataFrame
|
||||
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
|
||||
import org.apache.spark.sql.catalyst.expressions._
|
||||
import org.apache.spark.sql.execution.datasources._
|
||||
import org.apache.spark.sql.types._
|
||||
import org.mockito.Mockito.{mock, verify, when}
|
||||
|
||||
import com.microsoft.hyperspace.index.dataskipping._
|
||||
import com.microsoft.hyperspace.index.dataskipping.sketches._
|
||||
|
||||
class DataSkippingFileIndexTest extends DataSkippingSuite {
|
||||
test("DataSkippingFileIndex delegates methods to the FileIndex it is based on.") {
|
||||
val baseFileIndex = mock(classOf[FileIndex])
|
||||
val dsFileIndex = new DataSkippingFileIndex(
|
||||
spark,
|
||||
fileIdTracker,
|
||||
spark.emptyDataFrame,
|
||||
Literal.TrueLiteral,
|
||||
baseFileIndex)
|
||||
when(baseFileIndex.rootPaths).thenReturn(Seq(new Path("x")))
|
||||
assert(dsFileIndex.rootPaths === Seq(new Path("x")))
|
||||
when(baseFileIndex.inputFiles).thenReturn(Array("x/a", "x/b"))
|
||||
assert(dsFileIndex.inputFiles === Array("x/a", "x/b"))
|
||||
dsFileIndex.refresh()
|
||||
verify(baseFileIndex).refresh()
|
||||
when(baseFileIndex.sizeInBytes).thenReturn(12345)
|
||||
assert(dsFileIndex.sizeInBytes === 12345)
|
||||
val structType = StructType(StructField("A", IntegerType) :: Nil)
|
||||
when(baseFileIndex.partitionSchema).thenReturn(structType)
|
||||
assert(dsFileIndex.partitionSchema === structType)
|
||||
when(baseFileIndex.metadataOpsTimeNs).thenReturn(Some(100L))
|
||||
assert(dsFileIndex.metadataOpsTimeNs === Some(100L))
|
||||
}
|
||||
|
||||
test("listFiles returns partition directories with filtered files.") {
|
||||
val dsFileIndex = createDataSkippingFileIndex(
|
||||
spark.range(100).toDF("A"),
|
||||
MinMaxSketch("A"),
|
||||
LessThanOrEqual(UnresolvedAttribute("MinMax_A__0"), Literal(1)))
|
||||
val selectedPartitions = dsFileIndex.listFiles(Nil, Nil)
|
||||
val allPartitions = dsFileIndex.baseFileIndex.listFiles(Nil, Nil)
|
||||
assert(partitionsSize(selectedPartitions) < partitionsSize(allPartitions))
|
||||
assert(partitionsContain(allPartitions, selectedPartitions))
|
||||
}
|
||||
|
||||
def createDataSkippingFileIndex(
|
||||
df: DataFrame,
|
||||
sketch: Sketch,
|
||||
indexDataPred: Expression): DataSkippingFileIndex = {
|
||||
val sourceData = createSourceData(df)
|
||||
val baseFileIndex = sourceData.queryExecution.optimizedPlan.collectFirst {
|
||||
case LogicalRelation(HadoopFsRelation(location, _, _, _, _, _), _, _, _) => location
|
||||
}.get
|
||||
val (index, indexData) =
|
||||
DataSkippingIndexConfig("myind", sketch).createIndex(ctx, sourceData, Map.empty)
|
||||
new DataSkippingFileIndex(spark, fileIdTracker, indexData, indexDataPred, baseFileIndex)
|
||||
}
|
||||
|
||||
def partitionsSize(partitions: Seq[PartitionDirectory]): Long = {
|
||||
partitions.flatMap(_.files.map(_.getLen)).sum
|
||||
}
|
||||
|
||||
def partitionsContain(
|
||||
partitions: Seq[PartitionDirectory],
|
||||
others: Seq[PartitionDirectory]): Boolean = {
|
||||
val partitionsMap = partitions.map(pd => (pd.values, pd.files)).toMap
|
||||
others.forall { pd =>
|
||||
val files = partitionsMap.getOrElse(pd.values, Nil).toSet
|
||||
pd.files.forall(f => files.contains(f))
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -14,7 +14,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.util
|
||||
package com.microsoft.hyperspace.index.dataskipping.expressions
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions._
|
||||
import org.apache.spark.sql.types._
|
||||
|
@@ -37,6 +37,22 @@ class ExpressionUtilsTest extends HyperspaceSuite {
|
|||
assert(ExpressionUtils.normalize(expr) === expected)
|
||||
}
|
||||
|
||||
test("normalize removes expressions inserted for UDF.") {
|
||||
val arg = AttributeReference("A", IntegerType)(ExprId(42), Seq("t"))
|
||||
val func = (x: Int) => x + 1
|
||||
val expr = If(
|
||||
IsNull(arg),
|
||||
Literal(null, IntegerType),
|
||||
ScalaUDF(func, IntegerType, Seq(KnownNotNull(arg)), Nil))
|
||||
val expected =
|
||||
ScalaUDF(
|
||||
func,
|
||||
IntegerType,
|
||||
Seq(arg.withExprId(ExpressionUtils.nullExprId).withQualifier(Nil)),
|
||||
Nil)
|
||||
assert(ExpressionUtils.normalize(expr) === expected)
|
||||
}
|
||||
|
||||
test("ExtractIsNullDisjunction matches IsNull.") {
|
||||
val expr = IsNull(Literal(null))
|
||||
val args = expr match {
|
|
@@ -0,0 +1,205 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.expressions
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions._
|
||||
import org.apache.spark.sql.types.{BooleanType, IntegerType}
|
||||
import org.mockito.ArgumentMatchers.any
|
||||
import org.mockito.Mockito.{mock, when}
|
||||
|
||||
import com.microsoft.hyperspace.index.HyperspaceSuite
|
||||
|
||||
class ExtractorsTest extends HyperspaceSuite {
|
||||
val a = AttributeReference("a", IntegerType)()
|
||||
val b = AttributeReference("b", IntegerType)()
|
||||
val c = AttributeReference("c", IntegerType)()
|
||||
val d = AttributeReference("d", BooleanType)()
|
||||
val e = AttributeReference("e", BooleanType)()
|
||||
|
||||
val aa = Literal(0)
|
||||
val bb = Literal(1)
|
||||
val dd = Literal(true)
|
||||
|
||||
def getExtractor(x: Expression, y: Expression): ExpressionExtractor = {
|
||||
val extractor = mock(classOf[ExpressionExtractor])
|
||||
when(extractor.unapply(any())).thenReturn(None)
|
||||
when(extractor.unapply(x)).thenReturn(Some(y))
|
||||
extractor
|
||||
}
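getExtractor above stubs a Mockito mock of ExpressionExtractor so that unapply answers None for every expression except the one under test, which lets each case pin down exactly which operand a composite extractor recognized. A hand-rolled sketch of the same idea is below; the single unapply(e: Expression): Option[Expression] signature is an assumption inferred from how the mock is stubbed here.

// Hypothetical non-Mockito equivalent: an extractor that recognizes exactly one expression.
def singleMatchExtractor(x: Expression, y: Expression): ExpressionExtractor =
  new ExpressionExtractor {
    override def unapply(e: Expression): Option[Expression] =
      if (e == x) Some(y) else None
  }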
|
||||
|
||||
val aExtractor = getExtractor(a, aa)
|
||||
val bExtractor = getExtractor(b, bb)
|
||||
val dExtractor = getExtractor(d, dd)
|
||||
|
||||
test("EqualToExtractor matches EqualTo(a, b).") {
|
||||
assert(EqualToExtractor(aExtractor, bExtractor).unapply(EqualTo(a, b)) === Some((aa, bb)))
|
||||
}
|
||||
|
||||
test("EqualToExtractor matches EqualTo(b, a).") {
|
||||
assert(EqualToExtractor(aExtractor, bExtractor).unapply(EqualTo(b, a)) === Some((aa, bb)))
|
||||
}
|
||||
|
||||
test("EqualToExtractor does not match EqualTo(a, c).") {
|
||||
assert(EqualToExtractor(aExtractor, bExtractor).unapply(EqualTo(a, c)) === None)
|
||||
}
|
||||
|
||||
test("LessThanExtractor matches LessThan(a, b).") {
|
||||
assert(LessThanExtractor(aExtractor, bExtractor).unapply(LessThan(a, b)) === Some((aa, bb)))
|
||||
}
|
||||
|
||||
test("LessThanExtractor matches GreaterThan(b, a).") {
|
||||
assert(
|
||||
LessThanExtractor(aExtractor, bExtractor).unapply(GreaterThan(b, a)) ===
|
||||
Some((aa, bb)))
|
||||
}
|
||||
|
||||
test("LessThanExtractor does not match LessThan(b, a).") {
|
||||
assert(LessThanExtractor(aExtractor, bExtractor).unapply(LessThan(b, a)) === None)
|
||||
}
|
||||
|
||||
test("LessThanOrEqualExtractor matches LessThanOrEqual(a, b).") {
|
||||
assert(
|
||||
LessThanOrEqualExtractor(aExtractor, bExtractor).unapply(LessThanOrEqual(a, b)) ===
|
||||
Some((aa, bb)))
|
||||
}
|
||||
|
||||
test("LessThanOrEqualExtractor matches GreaterThanOrEqual(b, a).") {
|
||||
assert(
|
||||
LessThanOrEqualExtractor(aExtractor, bExtractor).unapply(GreaterThanOrEqual(b, a)) ===
|
||||
Some((aa, bb)))
|
||||
}
|
||||
|
||||
test("LessThanOrEqualExtractor does not match LessThanOrEqual(b, a).") {
|
||||
assert(
|
||||
LessThanOrEqualExtractor(aExtractor, bExtractor).unapply(LessThanOrEqual(b, a)) ===
|
||||
None)
|
||||
}
|
||||
|
||||
test("IsNullExtractor matches IsNull(a).") {
|
||||
assert(IsNullExtractor(aExtractor).unapply(IsNull(a)) === Some(aa))
|
||||
}
|
||||
|
||||
test("IsNullExtractor matches EqualNullSafe(a, null).") {
|
||||
assert(IsNullExtractor(aExtractor).unapply(EqualNullSafe(a, Literal(null))) === Some(aa))
|
||||
}
|
||||
|
||||
test("IsNullExtractor matches EqualNullSafe(null, a).") {
|
||||
assert(IsNullExtractor(aExtractor).unapply(EqualNullSafe(Literal(null), a)) === Some(aa))
|
||||
}
|
||||
|
||||
test("IsNullExtractor does not match IsNull(c).") {
|
||||
assert(IsNullExtractor(aExtractor).unapply(IsNull(c)) === None)
|
||||
}
|
||||
|
||||
test("IsNotNullExtractor matches IsNotNull(a).") {
|
||||
assert(IsNotNullExtractor(aExtractor).unapply(IsNotNull(a)) === Some(aa))
|
||||
}
|
||||
|
||||
test("IsNotNullExtractor matches Not(IsNull(a)).") {
|
||||
assert(IsNotNullExtractor(aExtractor).unapply(Not(IsNull(a))) === Some(aa))
|
||||
}
|
||||
|
||||
test("IsNotNullExtractor matches Not(EqualNullSafe(a, null)).") {
|
||||
assert(
|
||||
IsNotNullExtractor(aExtractor).unapply(Not(EqualNullSafe(a, Literal(null)))) ===
|
||||
Some(aa))
|
||||
}
|
||||
|
||||
test("IsNotNullExtractor matches Not(EqualNullSafe(null, a)).") {
|
||||
assert(
|
||||
IsNotNullExtractor(aExtractor).unapply(Not(EqualNullSafe(Literal(null), a))) ===
|
||||
Some(aa))
|
||||
}
|
||||
|
||||
test("IsNotNullExtractor does not match IsNotNull(c).") {
|
||||
assert(IsNotNullExtractor(aExtractor).unapply(IsNotNull(c)) === None)
|
||||
}
|
||||
|
||||
test("IsTrueExtractor matches d.") {
|
||||
assert(IsTrueExtractor(dExtractor).unapply(d) === Some(dd))
|
||||
}
|
||||
|
||||
test("IsTrueExtractor matches EqualTo(d, true).") {
|
||||
assert(IsTrueExtractor(dExtractor).unapply(EqualTo(d, Literal(true))) === Some(dd))
|
||||
}
|
||||
|
||||
test("IsTrueExtractor matches EqualTo(true, d).") {
|
||||
assert(IsTrueExtractor(dExtractor).unapply(EqualTo(Literal(true), d)) === Some(dd))
|
||||
}
|
||||
|
||||
test("IsTrueExtractor does not match e.") {
|
||||
assert(IsTrueExtractor(dExtractor).unapply(e) === None)
|
||||
}
|
||||
|
||||
test("IsTrueExtractor does not match a.") {
|
||||
assert(IsTrueExtractor(dExtractor).unapply(a) === None)
|
||||
}
|
||||
|
||||
test("IsFalseExtractor matches Not(d).") {
|
||||
assert(IsFalseExtractor(dExtractor).unapply(Not(d)) === Some(dd))
|
||||
}
|
||||
|
||||
test("IsFalseExtractor matches EqualTo(d, false).") {
|
||||
assert(IsFalseExtractor(dExtractor).unapply(EqualTo(d, Literal(false))) === Some(dd))
|
||||
}
|
||||
|
||||
test("IsFalseExtractor matches EqualTo(false, d).") {
|
||||
assert(IsFalseExtractor(dExtractor).unapply(EqualTo(Literal(false), d)) === Some(dd))
|
||||
}
|
||||
|
||||
test("IsFalseExtractor does not match Not(e).") {
|
||||
assert(IsFalseExtractor(dExtractor).unapply(Not(e)) === None)
|
||||
}
|
||||
|
||||
test("IsFalseExtractor does not match a.") {
|
||||
assert(IsFalseExtractor(dExtractor).unapply(a) === None)
|
||||
}
|
||||
|
||||
test("InExtractor matches In(a, Seq()).") {
|
||||
assert(InExtractor(aExtractor, bExtractor).unapply(In(a, Seq())) === Some((aa, Seq())))
|
||||
}
|
||||
|
||||
test("InExtractor matches In(a, Seq(b)).") {
|
||||
assert(InExtractor(aExtractor, bExtractor).unapply(In(a, Seq(b))) === Some((aa, Seq(bb))))
|
||||
}
|
||||
|
||||
test("InExtractor matches In(a, Seq(b, b)).") {
|
||||
assert(
|
||||
InExtractor(aExtractor, bExtractor).unapply(In(a, Seq(b, b))) ===
|
||||
Some((aa, Seq(bb, bb))))
|
||||
}
|
||||
|
||||
test("InExtractor does not match In(a, Seq(b, c)).") {
|
||||
assert(InExtractor(aExtractor, bExtractor).unapply(In(a, Seq(b, c))) === None)
|
||||
}
|
||||
|
||||
test("InExtractor does not match In(c, Seq(b, b)).") {
|
||||
assert(InExtractor(aExtractor, bExtractor).unapply(In(c, Seq(b, b))) === None)
|
||||
}
|
||||
|
||||
test("InSetExtractor matches InSet(a, Set()).") {
|
||||
assert(InSetExtractor(aExtractor).unapply(InSet(a, Set[Any]())) === Some((aa, Set[Any]())))
|
||||
}
|
||||
|
||||
test("InSetExtractor matches InSet(a, Set(1)).") {
|
||||
assert(InSetExtractor(aExtractor).unapply(InSet(a, Set[Any](1))) === Some((aa, Set[Any](1))))
|
||||
}
|
||||
|
||||
test("InSetExtractor does not match InSet(c, Set(1)).") {
|
||||
assert(InSetExtractor(aExtractor).unapply(InSet(c, Set[Any](1))) === None)
|
||||
}
|
||||
}
|
|
@@ -14,23 +14,25 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.util
|
||||
package com.microsoft.hyperspace.index.dataskipping.expressions
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions._
|
||||
import org.apache.spark.sql.types._
|
||||
|
||||
import com.microsoft.hyperspace.index.HyperspaceSuite
|
||||
|
||||
class NormalizedExprMatcherTest extends HyperspaceSuite {
|
||||
val matcher = NormalizedExprMatcher(
|
||||
class NormalizedExprExtractorTest extends HyperspaceSuite {
|
||||
val extractor = NormalizedExprExtractor(
|
||||
AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId, Nil),
|
||||
Map(ExprId(42) -> "A"))
|
||||
|
||||
test("apply returns true if the expression matches.") {
|
||||
assert(matcher(AttributeReference("a", IntegerType)(ExprId(42), Nil)) === true)
|
||||
assert(
|
||||
extractor.unapply(AttributeReference("a", IntegerType)(ExprId(42), Nil)) ===
|
||||
Some(extractor.expr))
|
||||
}
|
||||
|
||||
test("apply returns false if the expression does not match") {
|
||||
assert(matcher(Literal(42)) === false)
|
||||
assert(extractor.unapply(Literal(42)) === None)
|
||||
}
|
||||
}
|
|
@@ -14,7 +14,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.util
|
||||
package com.microsoft.hyperspace.index.dataskipping.expressions
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions._
|
||||
import org.apache.spark.sql.catalyst.expressions.codegen._
|
||||
|
@@ -84,4 +84,10 @@ class SortedArrayLowerBoundTest
|
|||
checkEvaluation(expr(IntegerType, Seq(1, 3), 4), null)
|
||||
checkEvaluation(expr(IntegerType, Seq(1, 3, 5), 6), null)
|
||||
}
|
||||
|
||||
test("SortedArrayLowerBound returns null if the value is null.") {
|
||||
checkEvaluation(expr(IntegerType, Seq(1), null), null)
|
||||
checkEvaluation(expr(IntegerType, Seq(1, 3), null), null)
|
||||
checkEvaluation(expr(IntegerType, Seq(1, 3, 5), null), null)
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,372 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.rules
|
||||
|
||||
import org.apache.spark.sql.DataFrame
|
||||
import org.apache.spark.sql.catalyst.plans.logical._
|
||||
import org.apache.spark.sql.execution.datasources._
|
||||
import org.apache.spark.sql.hyperspace.utils.logicalPlanToDataFrame
|
||||
|
||||
import com.microsoft.hyperspace.index._
|
||||
import com.microsoft.hyperspace.index.dataskipping._
|
||||
import com.microsoft.hyperspace.index.dataskipping.execution.DataSkippingFileIndex
|
||||
import com.microsoft.hyperspace.index.dataskipping.sketches._
|
||||
|
||||
class ApplyDataSkippingIndexTest extends DataSkippingSuite {
|
||||
import spark.implicits._
|
||||
|
||||
override val numParallelism: Int = 10
|
||||
|
||||
test("applyIndex returns the unmodified plan if no index is given.") {
|
||||
val sourceData = createSourceData(spark.range(100).toDF("A"))
|
||||
val query = sourceData.filter("A = 1")
|
||||
val plan = query.queryExecution.optimizedPlan
|
||||
assert(ApplyDataSkippingIndex.applyIndex(plan, Map.empty) === plan)
|
||||
}
|
||||
|
||||
test("score returns 0 if no index is given.") {
|
||||
val sourceData = createSourceData(spark.range(100).toDF("A"))
|
||||
val query = sourceData.filter("A = 1")
|
||||
val plan = query.queryExecution.optimizedPlan
|
||||
assert(ApplyDataSkippingIndex.score(plan, Map.empty) === 0)
|
||||
}
|
||||
|
||||
case class SourceData(df: () => DataFrame, description: String)
|
||||
|
||||
case class Param(
|
||||
sourceData: SourceData,
|
||||
filter: String,
|
||||
sketches: Seq[Sketch],
|
||||
numExpectedFiles: Int,
|
||||
setup: Option[() => _])
|
||||
|
||||
object Param {
|
||||
def apply(
|
||||
sourceData: SourceData,
|
||||
filter: String,
|
||||
sketch: Sketch,
|
||||
numExpectedFiles: Int): Param = {
|
||||
Param(sourceData, filter, Seq(sketch), numExpectedFiles, None)
|
||||
}
|
||||
|
||||
def apply(
|
||||
sourceData: SourceData,
|
||||
filter: String,
|
||||
sketches: Seq[Sketch],
|
||||
numExpectedFiles: Int): Param = {
|
||||
Param(sourceData, filter, sketches, numExpectedFiles, None)
|
||||
}
|
||||
|
||||
def apply(
|
||||
sourceData: SourceData,
|
||||
filter: String,
|
||||
sketch: Sketch,
|
||||
numExpectedFiles: Int,
|
||||
setup: () => _): Param = {
|
||||
Param(sourceData, filter, Seq(sketch), numExpectedFiles, Some(setup))
|
||||
}
|
||||
|
||||
def apply(
|
||||
sourceData: SourceData,
|
||||
filter: String,
|
||||
sketches: Seq[Sketch],
|
||||
numExpectedFiles: Int,
|
||||
setup: () => _): Param = {
|
||||
Param(sourceData, filter, sketches, numExpectedFiles, Some(setup))
|
||||
}
|
||||
}
|
||||
|
||||
def dataI: SourceData =
|
||||
SourceData(() => createSourceData(spark.range(100).toDF("A")), "source [A:Int]")
|
||||
|
||||
def dataII: SourceData =
|
||||
SourceData(
|
||||
() => createSourceData(spark.range(100).selectExpr("id as A", "id * 2 as B")),
|
||||
"source [A:Int, B:Int]")
|
||||
|
||||
def dataIN: SourceData =
|
||||
SourceData(
|
||||
() =>
|
||||
createSourceData(
|
||||
Seq[Integer](1, 2, null, null, null, null, 7, 8, 9, null, 11, 12, null, 14, null, null,
|
||||
17, null, 19, 20).toDF("A")),
|
||||
"source [A:Int] with nulls")
|
||||
|
||||
def dataIIP: SourceData =
|
||||
SourceData(
|
||||
() =>
|
||||
createPartitionedSourceData(
|
||||
spark.range(100).selectExpr("cast(id / 10 as int) as A", "id as B"),
|
||||
Seq("A")),
|
||||
"source [A:Int, B:Int] partitioned")
|
||||
|
||||
def dataD: SourceData =
|
||||
SourceData(
|
||||
() => createSourceData(spark.range(100).map(_.toDouble).toDF("A")),
|
||||
"source [A:Double]")
|
||||
|
||||
def dataDS: SourceData =
|
||||
SourceData(
|
||||
() =>
|
||||
createSourceData(
|
||||
Seq(
|
||||
0.0,
|
||||
1.0,
|
||||
1.5,
|
||||
Double.NegativeInfinity,
|
||||
Double.PositiveInfinity,
|
||||
Double.NaN,
|
||||
3.14,
|
||||
2.718,
|
||||
-1.1,
|
||||
-0.0).toDF("A")),
|
||||
"source [A:Double] small")
|
||||
|
||||
def dataN2: SourceData =
|
||||
SourceData(
|
||||
() =>
|
||||
createSourceData(
|
||||
spark.read.json(Seq(
|
||||
"""{"a": 1, "b": {"a": 0, "c": 2, "d": "x"}}""",
|
||||
"""{"a": 2, "b": {"a": 0, "c": 3, "d": "y"}}""",
|
||||
"""{"a": 3, "b": {"a": 1, "c": 4, "d": "x"}}""",
|
||||
"""{"a": 4, "b": {"a": 2, "c": null, "d": "x"}}""",
|
||||
"""{"a": 2, "b": {"a": 2, "c": 6, "d": "x"}}""",
|
||||
"""{"a": 2, "b": {"a": 1, "c": 7, "d": "x"}}""",
|
||||
"""{"b": {"c": 8, "d": "x"}}""",
|
||||
"""{"b": {"d": "y"}}""",
|
||||
"""{"a": 3}""",
|
||||
"""{"b": {"c": 11}}""").toDS)),
|
||||
"source [A:Int, B:[A:Int, C:Int, D: String]]")
|
||||
|
||||
def dataN3: SourceData =
|
||||
SourceData(
|
||||
() =>
|
||||
createSourceData(
|
||||
spark.read.json(Seq(
|
||||
"""{"a": {"b": {"c": 1}}}""",
|
||||
"""{"a": {"b": {"c": 2}}}""",
|
||||
"""{"a": {"b": {"c": 3}}}""",
|
||||
"""{"a": {"b": {"c": null}}}""",
|
||||
"""{"a": {"b": {"c": 5}}}""",
|
||||
"""{"a": {"b": {"c": 6}}}""",
|
||||
"""{"a": {"b": {"c": 7}}}""",
|
||||
"""{"a": {"b": {"c": 8}}}""",
|
||||
"""{"a": null}""",
|
||||
"""{"a": {"b": {"c": 0}}}""").toDS)),
|
||||
"source [A:[B:[C:Int]]]")
|
||||
|
||||
def dataB: SourceData =
|
||||
SourceData(
|
||||
() =>
|
||||
createSourceData(
|
||||
Seq(
|
||||
Array[Byte](0, 0, 0, 0),
|
||||
Array[Byte](0, 1, 0, 1),
|
||||
Array[Byte](1, 2, 3, 4),
|
||||
Array[Byte](5, 6, 7, 8),
|
||||
Array[Byte](32, 32, 32, 32),
|
||||
Array[Byte](64, 64, 64, 64),
|
||||
Array[Byte](1, 1, 1, 1),
|
||||
Array[Byte](-128, -128, -128, -128),
|
||||
Array[Byte](127, 127, 127, 127),
|
||||
Array[Byte](-1, 1, 0, 0)).toDF("A")),
|
||||
"source [A:Binary]")
|
||||
|
||||
def dataS: SourceData =
|
||||
SourceData(
|
||||
() =>
|
||||
createSourceData(
|
||||
Seq(
|
||||
"foo1",
|
||||
"foo2000",
|
||||
"foo3",
|
||||
"foo4",
|
||||
"foo5",
|
||||
null,
|
||||
"foo7",
|
||||
"foo8",
|
||||
"foo9",
|
||||
"baar",
|
||||
null)
|
||||
.toDF("A")),
|
||||
"source [A:String]")
|
||||
|
||||
Seq(
|
||||
Param(dataI, "A = 10", MinMaxSketch("A"), 1),
|
||||
Param(dataI, "50 = a", MinMaxSketch("A"), 1),
|
||||
Param(dataI, "A = -10", MinMaxSketch("a"), 0),
|
||||
Param(dataI, "A = 5 + 5", MinMaxSketch("A"), 1),
|
||||
Param(dataI, "A = 10 or A = 30", MinMaxSketch("A"), 2),
|
||||
Param(dataI, "A is null", MinMaxSketch("A"), 10),
|
||||
Param(dataI, "!(A is null)", MinMaxSketch("A"), 10),
|
||||
Param(dataI, "A is not null", MinMaxSketch("A"), 10),
|
||||
Param(dataI, "!(A is not null)", MinMaxSketch("A"), 10),
|
||||
Param(dataI, "A <=> 10", MinMaxSketch("A"), 1),
|
||||
Param(dataI, "10 <=> A", MinMaxSketch("A"), 1),
|
||||
Param(dataI, "A <=> null", MinMaxSketch("A"), 10),
|
||||
Param(dataI, "A <25", MinMaxSketch("A"), 3),
|
||||
Param(dataI, "30>A", MinMaxSketch("A"), 3),
|
||||
Param(dataI, "31 > A", MinMaxSketch("a"), 4),
|
||||
Param(dataI, "A > 25", MinMaxSketch("a"), 8),
|
||||
Param(dataI, "28 < A", MinMaxSketch("a"), 8),
|
||||
Param(dataI, "29< A", MinMaxSketch("A"), 7),
|
||||
Param(dataI, "A <= 25", MinMaxSketch("A"), 3),
|
||||
Param(dataI, "29 >= A", MinMaxSketch("A"), 3),
|
||||
Param(dataI, "30>=A", MinMaxSketch("A"), 4),
|
||||
Param(dataI, "A >= 25", MinMaxSketch("A"), 8),
|
||||
Param(dataI, "29 <= A", MinMaxSketch("A"), 8),
|
||||
Param(dataI, "30 <= A", MinMaxSketch("A"), 7),
|
||||
Param(dataI, "A != 1", MinMaxSketch("A"), 10),
|
||||
Param(dataI, "not (A != 1 and A != 10)", MinMaxSketch("A"), 2),
|
||||
Param(dataI, "!(!(A = 1))", MinMaxSketch("A"), 1),
|
||||
Param(dataI, "!(A < 20)", MinMaxSketch("A"), 8),
|
||||
Param(dataI, "not (A not in (1, 2, 3))", MinMaxSketch("A"), 1),
|
||||
Param(dataS, "A < 'foo'", MinMaxSketch("A"), 1),
|
||||
Param(dataI, "a = 10", MinMaxSketch("A"), 1),
|
||||
Param(dataI, "A = 10", MinMaxSketch("a"), 1),
|
||||
Param(dataI, "A in (1, 2, 3, null, 10)", MinMaxSketch("A"), 2),
|
||||
Param(dataI, "A in (10,9,8,7,6,5,4,3,2,1,50,49,48,47,46,45)", MinMaxSketch("A"), 4),
|
||||
Param(dataS, "A in ('foo1', 'foo5', 'foo9')", MinMaxSketch("A"), 3),
|
||||
Param(
|
||||
dataS,
|
||||
"A in ('foo1','a','b','c','d','e','f','g','h','i','j','k')",
|
||||
MinMaxSketch("A"),
|
||||
1),
|
||||
Param(dataD, "A in (1,2,3,15,16,17)", MinMaxSketch("A"), 2),
|
||||
Param(dataD, "A in (1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16)", MinMaxSketch("A"), 2),
|
||||
Param(dataB, "A in (x'00000000', x'0001', x'0002', x'05060708')", MinMaxSketch("A"), 2),
|
||||
Param(
|
||||
dataB,
|
||||
"A in (x'00',x'01',x'02',x'03',x'04',x'05',x'06',x'07',x'08',x'09',x'0a',x'20202020')",
|
||||
MinMaxSketch("A"),
|
||||
1),
|
||||
Param(dataI, "A BETWEEN 27 AND 51", MinMaxSketch("A"), 4),
|
||||
Param(dataI, "IF(A=1,2,3)=2", MinMaxSketch("A"), 10),
|
||||
Param(dataII, "A = 10 OR B = 50", Seq(MinMaxSketch("A"), MinMaxSketch("B")), 2),
|
||||
Param(dataII, "A = 10 or B = 50", Seq(MinMaxSketch("A")), 10),
|
||||
Param(dataII, "B = 50 or A = 10", Seq(MinMaxSketch("A")), 10),
|
||||
Param(dataII, "A = 10 and B = 20", MinMaxSketch("A"), 1),
|
||||
Param(dataII, "a = 10 AND b = 20", Seq(MinMaxSketch("A"), MinMaxSketch("B")), 1),
|
||||
Param(dataII, "A < 30 and B > 20", MinMaxSketch("A"), 3),
|
||||
Param(dataII, "A < 30 and b > 40", Seq(MinMaxSketch("a"), MinMaxSketch("B")), 1),
|
||||
Param(dataII, "A = 10 and B = 90", Seq(MinMaxSketch("A"), MinMaxSketch("B")), 0),
|
||||
Param(dataIN, "A is not null", MinMaxSketch("A"), 7),
|
||||
Param(dataIN, "!(A <=> null)", MinMaxSketch("A"), 7),
|
||||
Param(dataIN, "A = 2", MinMaxSketch("A"), 1),
|
||||
Param(dataIN, "A is null", MinMaxSketch("A"), 10),
|
||||
Param(dataIIP, "B = 10", MinMaxSketch("B"), 1),
|
||||
Param(dataIIP, "A = 5 and B = 20", MinMaxSketch("B"), 0),
|
||||
Param(dataIIP, "A < 5 and B = 20", MinMaxSketch("B"), 1),
|
||||
Param(dataN2, "B.C = 2", MinMaxSketch("B.C"), 1),
|
||||
Param(dataN2, "B.c = 2", MinMaxSketch("b.C"), 1),
|
||||
Param(dataN2, "b.c < 5", MinMaxSketch("b.c"), 3),
|
||||
Param(dataN3, "A.B.C = 2", MinMaxSketch("a.B.C"), 1),
|
||||
Param(dataDS, "A = 1.0", MinMaxSketch("A"), 1),
|
||||
Param(dataDS, "A <= 1.5", MinMaxSketch("A"), 6),
|
||||
Param(dataDS, "A >= 1.5", MinMaxSketch("A"), 5),
|
||||
Param(dataD, "A in (1, 2, 3, 10)", MinMaxSketch("A"), 2),
|
||||
Param(dataII, "A + B < 100", MinMaxSketch("a+b"), 4),
|
||||
Param(
|
||||
dataI,
|
||||
"F(A) = 10",
|
||||
MinMaxSketch("F(A)"),
|
||||
1,
|
||||
() => spark.udf.register("F", (a: Int) => a * 2)),
|
||||
Param(
|
||||
dataI,
|
||||
"is_less_than_23(A)",
|
||||
MinMaxSketch("is_less_than_23(A)"),
|
||||
3,
|
||||
() => spark.udf.register("is_less_than_23", (a: Int) => a < 23)),
|
||||
Param(
|
||||
dataI,
|
||||
"!is_less_than_23(A)",
|
||||
MinMaxSketch("is_less_than_23(A)"),
|
||||
8,
|
||||
() => spark.udf.register("is_less_than_23", (a: Int) => a < 23)),
|
||||
Param(
|
||||
dataII,
|
||||
"A < 50 and F(A,B) < 20",
|
||||
Seq(MinMaxSketch("A"), MinMaxSketch("F(A,B)")),
|
||||
2,
|
||||
() => spark.udf.register("F", (a: Int, b: Int) => b - a)),
|
||||
Param(
|
||||
dataI,
|
||||
"f(a) < 30",
|
||||
MinMaxSketch("F(a)"),
|
||||
2,
|
||||
() => spark.udf.register("F", (a: Int) => a * 2)),
|
||||
Param(
|
||||
dataI,
|
||||
"IF(A IS NULL,NULL,F(A))=2",
|
||||
MinMaxSketch("A"),
|
||||
10,
|
||||
() => spark.udf.register("F", (a: Int) => a * 2))).foreach {
|
||||
case Param(sourceData, filter, sketches, numExpectedFiles, setup) =>
|
||||
test(
|
||||
s"applyIndex works as expected for ${sourceData.description}: " +
|
||||
s"filter=[$filter], sketches=[${sketches.mkString(", ")}], " +
|
||||
s"numExpectedFiles=[$numExpectedFiles]") {
|
||||
val indexConfig = DataSkippingIndexConfig("ind1", sketches.head, sketches.tail: _*)
|
||||
if (setup.nonEmpty) {
|
||||
setup.get.apply()
|
||||
}
|
||||
testApplyIndex(sourceData.df(), filter, indexConfig, numExpectedFiles)
|
||||
}
|
||||
}
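Each Param row above is turned into one generated test by the foreach, so extending coverage means appending a row with the source data, the filter string, the sketch(es), and the number of files expected to survive skipping. A hypothetical extra row, assuming dataI's 100 sequential ids land in 10 files of 10 rows each under this suite's numParallelism of 10:

// Hypothetical additional row: only the last of the 10 files can hold values >= 90,
// so a MinMax sketch on A should prune the scan to a single file.
Param(dataI, "A >= 90", MinMaxSketch("A"), 1)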
|
||||
|
||||
def testApplyIndex(
|
||||
sourceData: DataFrame,
|
||||
filter: String,
|
||||
indexConfig: DataSkippingIndexConfig,
|
||||
numExpectedFiles: Int): Unit = {
|
||||
val originalNumFiles = listFiles(dataPath()).filter(isParquet).length
|
||||
val query = sourceData.filter(filter)
|
||||
val plan = query.queryExecution.optimizedPlan
|
||||
val indexLogEntry = createIndexLogEntry(indexConfig, sourceData)
|
||||
val indexDataPred = indexLogEntry.derivedDataset
|
||||
.asInstanceOf[DataSkippingIndex]
|
||||
.translateFilterCondition(
|
||||
spark,
|
||||
plan.asInstanceOf[Filter].condition,
|
||||
sourceData.queryExecution.optimizedPlan)
|
||||
indexLogEntry.setTagValue(plan, IndexLogEntryTags.DATASKIPPING_INDEX_PREDICATE, indexDataPred)
|
||||
val optimizedPlan = ApplyDataSkippingIndex.applyIndex(
|
||||
plan,
|
||||
Map(sourceData.queryExecution.optimizedPlan -> indexLogEntry))
|
||||
if (indexDataPred.isEmpty) {
|
||||
assert(optimizedPlan === plan)
|
||||
} else {
|
||||
assert(optimizedPlan !== plan)
|
||||
optimizedPlan match {
|
||||
case Filter(
|
||||
_,
|
||||
LogicalRelation(
|
||||
HadoopFsRelation(location: DataSkippingFileIndex, _, _, _, _, _),
|
||||
_,
|
||||
_,
|
||||
_)) =>
|
||||
assert(location.indexDataPred === indexDataPred.get)
|
||||
case _ => fail(s"unexpected optimizedPlan: $optimizedPlan")
|
||||
}
|
||||
}
|
||||
val optimizedDf = logicalPlanToDataFrame(spark, optimizedPlan)
|
||||
checkAnswer(optimizedDf, query)
|
||||
assert(numAccessedFiles(optimizedDf) === numExpectedFiles)
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,83 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.rules
|
||||
|
||||
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
|
||||
import org.apache.spark.sql.catalyst.expressions._
|
||||
|
||||
import com.microsoft.hyperspace.index.IndexLogEntryTags
import com.microsoft.hyperspace.index.dataskipping._
import com.microsoft.hyperspace.index.dataskipping.sketches._
import com.microsoft.hyperspace.index.plananalysis.FilterReasons.IneligibleFilterCondition

class FilterConditionFilterTest extends DataSkippingSuite {
  test("apply returns an empty map if there are no candidate indexes.") {
    val df = spark.range(10).toDF("A")
    assert(FilterConditionFilter(df.queryExecution.optimizedPlan, Map.empty) === Map.empty)
  }

  test("apply returns an empty map if the plan is not a filter.") {
    val df = createSourceData(spark.range(10).toDF("A"))
    val indexConfig = DataSkippingIndexConfig("myind", MinMaxSketch("A"))
    val indexLogEntry = createIndexLogEntry(indexConfig, df)
    val candidateIndexes = Map(df.queryExecution.optimizedPlan -> Seq(indexLogEntry))
    assert(FilterConditionFilter(df.queryExecution.optimizedPlan, candidateIndexes) === Map.empty)
  }

  test("apply creates an index data predicate if the index can be applied to the plan.") {
    val df = createSourceData(spark.range(10).toDF("A"))
    val indexConfig = DataSkippingIndexConfig("myind", MinMaxSketch("A"))
    val indexLogEntry = createIndexLogEntry(indexConfig, df)
    val candidateIndexes = Map(df.queryExecution.optimizedPlan -> Seq(indexLogEntry))
    val plan = df.filter("A = 1").queryExecution.optimizedPlan
    assert(FilterConditionFilter(plan, candidateIndexes) === candidateIndexes)
    val indexDataPredOpt =
      indexLogEntry.getTagValue(plan, IndexLogEntryTags.DATASKIPPING_INDEX_PREDICATE)
    assert(
      indexDataPredOpt === Some(Some(And(
        IsNotNull(UnresolvedAttribute("MinMax_A__0")),
        And(
          LessThanOrEqual(UnresolvedAttribute("MinMax_A__0"), Literal(1L)),
          GreaterThanOrEqual(UnresolvedAttribute("MinMax_A__1"), Literal(1L)))))))
  }

  test("apply returns an empty map if the filter condition is not suitable.") {
    val df = createSourceData(spark.range(10).selectExpr("id as A", "id * 2 as B"))
    val indexConfig = DataSkippingIndexConfig("myind", MinMaxSketch("A"))
    val indexLogEntry = createIndexLogEntry(indexConfig, df)
    indexLogEntry.setTagValue(IndexLogEntryTags.INDEX_PLAN_ANALYSIS_ENABLED, true)
    val candidateIndexes = Map(df.queryExecution.optimizedPlan -> Seq(indexLogEntry))
    val plan = df.filter("B = 1").queryExecution.optimizedPlan
    assert(FilterConditionFilter(plan, candidateIndexes) === Map.empty)
    val reason = indexLogEntry.getTagValue(plan, IndexLogEntryTags.FILTER_REASONS)
    assert(reason === Some(List(IneligibleFilterCondition("((`B` IS NOT NULL) AND (`B` = 1L))"))))
  }

  test("apply returns only the applicable indexes when there are multiple candidate indexes.") {
    val df = createSourceData(spark.range(10).selectExpr("id as A", "id * 2 as B"))
    val indexConfig1 = DataSkippingIndexConfig("myind", MinMaxSketch("A"))
    val indexConfig2 = DataSkippingIndexConfig("myind", MinMaxSketch("B"))
    val indexLogEntry1 = createIndexLogEntry(indexConfig1, df)
    val indexLogEntry2 = createIndexLogEntry(indexConfig2, df)
    val candidateIndexes =
      Map(df.queryExecution.optimizedPlan -> Seq(indexLogEntry1, indexLogEntry2))
    val plan = df.filter("A = 1").queryExecution.optimizedPlan
    assert(
      FilterConditionFilter(plan, candidateIndexes) === Map(
        df.queryExecution.optimizedPlan -> Seq(indexLogEntry1)))
  }
}
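For reference, a minimal usage sketch of the behaviour verified above; it reuses the DataSkippingSuite helpers from this test file and is an illustration, not part of the commit:

// Sketch only: createSourceData / createIndexLogEntry come from DataSkippingSuite.
val df = createSourceData(spark.range(10).toDF("A"))
val entry = createIndexLogEntry(DataSkippingIndexConfig("myind", MinMaxSketch("A")), df)
val candidates = Map(df.queryExecution.optimizedPlan -> Seq(entry))
val plan = df.filter("A = 1").queryExecution.optimizedPlan

// FilterConditionFilter keeps only the indexes whose sketches can translate the filter
// condition, and tags each kept entry with the translated index-data predicate.
val applicable = FilterConditionFilter(plan, candidates)
val indexDataPredicate =
  entry.getTagValue(plan, IndexLogEntryTags.DATASKIPPING_INDEX_PREDICATE)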
@ -0,0 +1,51 @@
/*
 * Copyright (2021) The Hyperspace Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.hyperspace.index.dataskipping.rules

import com.microsoft.hyperspace.index.dataskipping._
import com.microsoft.hyperspace.index.dataskipping.sketches._

class FilterPlanNodeFilterTest extends DataSkippingSuite {
  test("apply returns an empty map if there are no candidate indexes.") {
    val df = spark.range(10).toDF("A")
    assert(FilterPlanNodeFilter(df.queryExecution.optimizedPlan, Map.empty) === Map.empty)
  }

  test("apply returns an empty map if the plan is not a filter.") {
    val df = createSourceData(spark.range(10).toDF("A"))
    val indexConfig = DataSkippingIndexConfig("myind", MinMaxSketch("A"))
    val indexLogEntry = createIndexLogEntry(indexConfig, df)
    val candidateIndexes = Map(df.queryExecution.optimizedPlan -> Seq(indexLogEntry))
    val plan = df.groupBy("A").count().queryExecution.optimizedPlan
    assert(FilterPlanNodeFilter(plan, candidateIndexes) === Map.empty)
  }

  test("apply returns applicable indexes only.") {
    val df1 = createSourceData(spark.range(10).toDF("A"), "T1")
    val df2 = createSourceData(spark.range(10).toDF("A"), "T2")
    val indexConfig = DataSkippingIndexConfig("myind", MinMaxSketch("A"))
    val indexLogEntry1 = createIndexLogEntry(indexConfig, df1)
    val indexLogEntry2 = createIndexLogEntry(indexConfig, df2)
    val candidateIndexes = Map(
      df1.queryExecution.optimizedPlan -> Seq(indexLogEntry1),
      df2.queryExecution.optimizedPlan -> Seq(indexLogEntry2))
    val plan = df1.filter("A = 1").queryExecution.optimizedPlan
    assert(
      FilterPlanNodeFilter(plan, candidateIndexes) === Map(
        df1.queryExecution.optimizedPlan -> Seq(indexLogEntry1)))
  }
}
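The wiring of these two candidate filters into the data skipping rule is not part of this hunk; as a hedged sketch of the intended composition, the plan-shape check runs before the condition check so that only filter nodes over indexed sources reach the sketch translation:

// Assumed composition (illustration only): narrow by plan shape, then by condition.
val narrowed = FilterConditionFilter(plan, FilterPlanNodeFilter(plan, candidateIndexes))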
@ -1,66 +0,0 @@
/*
 * Copyright (2021) The Hyperspace Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.hyperspace.index.dataskipping.sketch

import org.apache.spark.sql.{Column, QueryTest}
import org.mockito.Mockito.mock

import com.microsoft.hyperspace.index.HyperspaceSuite

class MinMaxSketchTest extends QueryTest with HyperspaceSuite {
  import spark.implicits._

  test("indexedColumns returns the indexed column.") {
    val sketch = MinMaxSketch("A")
    assert(sketch.indexedColumns === Seq("A"))
  }

  test("referencedColumns returns the indexed column.") {
    val sketch = MinMaxSketch("A")
    assert(sketch.referencedColumns === Seq("A"))
  }

  test("aggregateFunctions returns min and max aggregation functions.") {
    val sketch = MinMaxSketch("A")
    val aggrs = sketch.aggregateFunctions.map(new Column(_))
    val data = Seq(1, -1, 10, 2, 4).toDF("A")
    checkAnswer(data.select(aggrs: _*), Seq((-1, 10)).toDF)
  }

  test("toString returns a reasonable string.") {
    val sketch = MinMaxSketch("A")
    assert(sketch.toString === "MinMax(A)")
  }

  test("Two sketches are equal if their columns are equal.") {
    assert(MinMaxSketch("A") === MinMaxSketch("A"))
    assert(MinMaxSketch("A") !== MinMaxSketch("a"))
    assert(MinMaxSketch("b") !== MinMaxSketch("B"))
    assert(MinMaxSketch("B") === MinMaxSketch("B"))
  }

  test("MinMaxSketch is different from other sketches.") {
    val s1 = MinMaxSketch("A")
    val s2 = mock(classOf[Sketch])
    assert(s1 !== s2)
  }

  test("hashCode is reasonably implemented.") {
    assert(MinMaxSketch("A").hashCode === MinMaxSketch("A").hashCode)
    assert(MinMaxSketch("A").hashCode !== MinMaxSketch("a").hashCode)
  }
}
@ -0,0 +1,338 @@
/*
 * Copyright (2021) The Hyperspace Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.hyperspace.index.dataskipping.sketches

import org.apache.spark.sql.{Column, QueryTest}
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._
import org.mockito.Mockito.mock

import com.microsoft.hyperspace.index.HyperspaceSuite
import com.microsoft.hyperspace.index.dataskipping.expressions._

class MinMaxSketchTest extends QueryTest with HyperspaceSuite {
  import spark.implicits._

  val valueExtractor = AttrValueExtractor(Map.empty)

  test("indexedColumns returns the indexed column.") {
    val sketch = MinMaxSketch("A")
    assert(sketch.indexedColumns === Seq("A"))
  }

  test("referencedColumns returns the indexed column.") {
    val sketch = MinMaxSketch("A")
    assert(sketch.referencedColumns === Seq("A"))
  }

  test("aggregateFunctions returns min and max aggregation functions.") {
    val sketch = MinMaxSketch("A")
    val aggrs = sketch.aggregateFunctions.map(new Column(_))
    val data = Seq(1, -1, 10, 2, 4).toDF("A")
    checkAnswer(data.select(aggrs: _*), Seq((-1, 10)).toDF)
  }

  test("toString returns a reasonable string.") {
    val sketch = MinMaxSketch("A")
    assert(sketch.toString === "MinMax(A)")
  }

  test("Two sketches are equal if their columns are equal.") {
    assert(MinMaxSketch("A") === MinMaxSketch("A"))
    assert(MinMaxSketch("A") !== MinMaxSketch("a"))
    assert(MinMaxSketch("b") !== MinMaxSketch("B"))
    assert(MinMaxSketch("B") === MinMaxSketch("B"))
  }

  test("MinMaxSketch is different from other sketches.") {
    val s1 = MinMaxSketch("A")
    val s2 = mock(classOf[Sketch])
    assert(s1 !== s2)
  }

  test("hashCode is reasonably implemented.") {
    assert(MinMaxSketch("A").hashCode === MinMaxSketch("A").hashCode)
    assert(MinMaxSketch("A").hashCode !== MinMaxSketch("a").hashCode)
  }

  test("convertPredicate converts EqualTo(<col>, <lit>).") {
    val sketch = MinMaxSketch("A")
    val predicate = EqualTo(AttributeReference("A", IntegerType)(ExprId(0)), Literal(42))
    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
    val nameMap = Map(ExprId(0) -> "A")
    val result = sketch.convertPredicate(
      predicate,
      Seq(AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId)),
      sketchValues,
      nameMap,
      valueExtractor)
    val expected = Some(
      And(
        LessThanOrEqual(sketchValues(0), Literal(42)),
        GreaterThanOrEqual(sketchValues(1), Literal(42))))
    assert(result === expected)
  }

  test("convertPredicate converts EqualTo(<lit>, <col>).") {
    val sketch = MinMaxSketch("A")
    val predicate = EqualTo(Literal(42), AttributeReference("A", IntegerType)(ExprId(0)))
    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
    val nameMap = Map(ExprId(0) -> "A")
    val result = sketch.convertPredicate(
      predicate,
      Seq(AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId)),
      sketchValues,
      nameMap,
      valueExtractor)
    val expected = Some(
      And(
        LessThanOrEqual(sketchValues(0), Literal(42)),
        GreaterThanOrEqual(sketchValues(1), Literal(42))))
    assert(result === expected)
  }

  test("convertPredicate converts EqualTo(<struct field access>, <lit>).") {
    val sketch = MinMaxSketch("A.C")
    val structAccess = GetStructField(
      AttributeReference("A", StructType(Seq(StructField("C", IntegerType))))(ExprId(0)),
      0)
    val predicate = EqualTo(structAccess, Literal(42))
    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
    val nameMap = Map(ExprId(0) -> "A")
    val result = sketch.convertPredicate(
      predicate,
      Seq(structAccess.transformUp {
        case attr: AttributeReference => attr.withExprId(ExpressionUtils.nullExprId)
      }),
      sketchValues,
      nameMap,
      valueExtractor)
    val expected = Some(
      And(
        LessThanOrEqual(sketchValues(0), Literal(42)),
        GreaterThanOrEqual(sketchValues(1), Literal(42))))
    assert(result === expected)
  }

  test("convertPredicate converts EqualTo(<nested struct field access>, <lit>).") {
    val sketch = MinMaxSketch("A.B.C")
    val structAccess = GetStructField(
      GetStructField(
        AttributeReference(
          "A",
          StructType(Seq(StructField("B", StructType(Seq(StructField("C", IntegerType)))))))(
          ExprId(0)),
        0),
      0)
    val predicate = EqualTo(structAccess, Literal(42))
    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
    val nameMap = Map(ExprId(0) -> "A")
    val result = sketch.convertPredicate(
      predicate,
      Seq(structAccess.transformUp {
        case attr: AttributeReference => attr.withExprId(ExpressionUtils.nullExprId)
      }),
      sketchValues,
      nameMap,
      valueExtractor)
    val expected = Some(
      And(
        LessThanOrEqual(sketchValues(0), Literal(42)),
        GreaterThanOrEqual(sketchValues(1), Literal(42))))
    assert(result === expected)
  }

  test("convertPredicate converts EqualTo(<col>, <lit>) - string type.") {
    val sketch = MinMaxSketch("A")
    val predicate =
      EqualTo(AttributeReference("A", StringType)(ExprId(0)), Literal.create("hello", StringType))
    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
    val nameMap = Map(ExprId(0) -> "A")
    val result = sketch.convertPredicate(
      predicate,
      Seq(AttributeReference("A", StringType)(ExpressionUtils.nullExprId)),
      sketchValues,
      nameMap,
      valueExtractor)
    val expected = Some(
      And(
        LessThanOrEqual(sketchValues(0), Literal.create("hello", StringType)),
        GreaterThanOrEqual(sketchValues(1), Literal.create("hello", StringType))))
    assert(result === expected)
  }

  test("convertPredicate converts EqualTo(<col>, <lit>) - double type.") {
    val sketch = MinMaxSketch("A")
    val predicate =
      EqualTo(AttributeReference("A", StringType)(ExprId(0)), Literal(3.14, DoubleType))
    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
    val nameMap = Map(ExprId(0) -> "A")
    val result = sketch.convertPredicate(
      predicate,
      Seq(AttributeReference("A", StringType)(ExpressionUtils.nullExprId)),
      sketchValues,
      nameMap,
      valueExtractor)
    val expected = Some(
      And(
        LessThanOrEqual(sketchValues(0), Literal(3.14, DoubleType)),
        GreaterThanOrEqual(sketchValues(1), Literal(3.14, DoubleType))))
    assert(result === expected)
  }

  test("convertPredicate converts LessThan.") {
    val sketch = MinMaxSketch("A")
    val predicate = LessThan(AttributeReference("A", IntegerType)(ExprId(0)), Literal(42))
    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
    val nameMap = Map(ExprId(0) -> "A")
    val result = sketch.convertPredicate(
      predicate,
      Seq(AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId)),
      sketchValues,
      nameMap,
      valueExtractor)
    val expected = Some(LessThan(sketchValues(0), Literal(42)))
    assert(result === expected)
  }

  test("convertPredicate converts LessThan - string type.") {
    val sketch = MinMaxSketch("A")
    val predicate = LessThan(
      AttributeReference("A", StringType)(ExprId(0)),
      Literal.create("hello", StringType))
    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
    val nameMap = Map(ExprId(0) -> "A")
    val result = sketch.convertPredicate(
      predicate,
      Seq(AttributeReference("A", StringType)(ExpressionUtils.nullExprId)),
      sketchValues,
      nameMap,
      valueExtractor)
    val expected = Some(LessThan(sketchValues(0), Literal.create("hello", StringType)))
    assert(result === expected)
  }

  test("convertPredicate converts LessThanOrEqual.") {
    val sketch = MinMaxSketch("A")
    val predicate = LessThanOrEqual(AttributeReference("A", IntegerType)(ExprId(0)), Literal(42))
    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
    val nameMap = Map(ExprId(0) -> "A")
    val result = sketch.convertPredicate(
      predicate,
      Seq(AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId)),
      sketchValues,
      nameMap,
      valueExtractor)
    val expected = Some(LessThanOrEqual(sketchValues(0), Literal(42)))
    assert(result === expected)
  }

  test("convertPredicate converts GreaterThan.") {
    val sketch = MinMaxSketch("A")
    val predicate = GreaterThan(AttributeReference("A", IntegerType)(ExprId(0)), Literal(42))
    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
    val nameMap = Map(ExprId(0) -> "A")
    val result = sketch.convertPredicate(
      predicate,
      Seq(AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId)),
      sketchValues,
      nameMap,
      valueExtractor)
    val expected = Some(GreaterThan(sketchValues(1), Literal(42)))
    assert(result === expected)
  }

  test("convertPredicate converts GreaterThanOrEqual.") {
    val sketch = MinMaxSketch("A")
    val predicate =
      GreaterThanOrEqual(AttributeReference("A", IntegerType)(ExprId(0)), Literal(42))
    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
    val nameMap = Map(ExprId(0) -> "A")
    val result = sketch.convertPredicate(
      predicate,
      Seq(AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId)),
      sketchValues,
      nameMap,
      valueExtractor)
    val expected = Some(GreaterThanOrEqual(sketchValues(1), Literal(42)))
    assert(result === expected)
  }

  test("convertPredicate converts In.") {
    val sketch = MinMaxSketch("A")
    val predicate =
      In(AttributeReference("A", IntegerType)(ExprId(0)), Seq(Literal(42), Literal(23)))
    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
    val nameMap = Map(ExprId(0) -> "A")
    val result = sketch.convertPredicate(
      predicate,
      Seq(AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId)),
      sketchValues,
      nameMap,
      valueExtractor)
    val expected = Some(
      Or(
        And(
          LessThanOrEqual(sketchValues(0), Literal(42)),
          GreaterThanOrEqual(sketchValues(1), Literal(42))),
        And(
          LessThanOrEqual(sketchValues(0), Literal(23)),
          GreaterThanOrEqual(sketchValues(1), Literal(23)))))
    assert(result === expected)
  }

  test("convertPredicate converts In - string type.") {
    val sketch = MinMaxSketch("A")
    val predicate =
      In(
        AttributeReference("A", StringType)(ExprId(0)),
        Seq(Literal.create("hello", StringType), Literal.create("world", StringType)))
    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
    val nameMap = Map(ExprId(0) -> "A")
    val result = sketch.convertPredicate(
      predicate,
      Seq(AttributeReference("A", StringType)(ExpressionUtils.nullExprId)),
      sketchValues,
      nameMap,
      valueExtractor)
    val expected = Some(
      Or(
        And(
          LessThanOrEqual(sketchValues(0), Literal.create("hello", StringType)),
          GreaterThanOrEqual(sketchValues(1), Literal.create("hello", StringType))),
        And(
          LessThanOrEqual(sketchValues(0), Literal.create("world", StringType)),
          GreaterThanOrEqual(sketchValues(1), Literal.create("world", StringType)))))
    assert(result === expected)
  }

  test("convertPredicate does not convert Not(EqualTo(<col>, <lit>)).") {
    val sketch = MinMaxSketch("A")
    val predicate = Not(EqualTo(AttributeReference("A", IntegerType)(ExprId(0)), Literal(42)))
    val sketchValues = Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max"))
    val nameMap = Map(ExprId(0) -> "A")
    val result = sketch.convertPredicate(
      predicate,
      Seq(AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId)),
      sketchValues,
      nameMap,
      valueExtractor)
    val expected = None
    assert(result === expected)
  }
}
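Taken together, the convertPredicate tests above pin down the following translation, where sketchValues(0) is the per-file minimum and sketchValues(1) the per-file maximum of the indexed column; the summary and the call below are illustrative, not commit code:

// A = v          ->  min <= v && max >= v
// A < v          ->  min < v
// A <= v         ->  min <= v
// A > v          ->  max > v
// A >= v         ->  max >= v
// A IN (v1, v2)  ->  (min <= v1 && max >= v1) || (min <= v2 && max >= v2)
// Not(A = v)     ->  None (not convertible with only min/max)
val converted = MinMaxSketch("A").convertPredicate(
  LessThan(AttributeReference("A", IntegerType)(ExprId(0)), Literal(42)),
  Seq(AttributeReference("A", IntegerType)(ExpressionUtils.nullExprId)),
  Seq(UnresolvedAttribute("min"), UnresolvedAttribute("max")),
  Map(ExprId(0) -> "A"),
  AttrValueExtractor(Map.empty))
// converted == Some(LessThan(UnresolvedAttribute("min"), Literal(42)))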
@ -0,0 +1,166 @@
/*
 * Copyright (2021) The Hyperspace Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.hyperspace.index.dataskipping.sketches

import org.apache.spark.sql.{Column, QueryTest}
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types._

import com.microsoft.hyperspace.index.HyperspaceSuite
import com.microsoft.hyperspace.index.dataskipping.expressions._

class PartitionSketchTest extends QueryTest with HyperspaceSuite {
  import spark.implicits._

  test("indexedColumns returns the indexed column.") {
    val sketch = PartitionSketch(Seq(("A", None)))
    assert(sketch.indexedColumns === Seq("A"))
  }

  test("referencedColumns returns the indexed column.") {
    val sketch = PartitionSketch(Seq(("a", None), ("b", None)))
    assert(sketch.referencedColumns === Seq("a", "b"))
  }

  test("aggregateFunctions returns first aggregation function.") {
    val sketch = PartitionSketch(Seq(("A", None)))
    val aggrs = sketch.aggregateFunctions.map(new Column(_))
    val data = Seq(1, 1, 1, 1, 1).toDF("A")
    checkAnswer(data.select(aggrs: _*), Seq(1).toDF)
  }

  test("toString returns a human-readable string representation.") {
    val sketch = PartitionSketch(Seq(("A", None), ("B", None)))
    assert(sketch.toString === "Partition(A, B)")
  }

  test("Two sketches are equal if their columns are equal.") {
    assert(PartitionSketch(Seq(("A", None))) === PartitionSketch(Seq(("A", None))))
    assert(
      PartitionSketch(Seq(("A", Some(LongType)))) !==
        PartitionSketch(Seq(("A", Some(IntegerType)))))
  }

  test("hashCode is reasonably implemented.") {
    assert(
      PartitionSketch(Seq(("A", None))).hashCode === PartitionSketch(Seq(("A", None))).hashCode)
    assert(
      PartitionSketch(Seq(("A", Some(LongType)))).hashCode !==
        PartitionSketch(Seq(("A", Some(IntegerType)))).hashCode)
  }

  val a = AttributeReference("A", IntegerType)(ExprId(0))
  val b = AttributeReference("B", IntegerType)(ExprId(1))
  val sketchA = UnresolvedAttribute("Partition_A")
  val sketchB = UnresolvedAttribute("Partition_B")

  {
    val sketch = PartitionSketch(Seq(("A", Some(IntegerType))))
    val resolvedExprs = Seq(ExpressionUtils.normalize(a))
    val sketchValues = Seq(sketchA)
    val valueExtractor = AttrValueExtractor(Map(a -> sketchA))

    test("convertPredicate converts EqualTo(<col>, <lit>).") {
      assert(
        sketch.convertPredicate(
          EqualTo(a, Literal(42)),
          resolvedExprs,
          sketchValues,
          Map.empty,
          valueExtractor) === Some(EqualTo(sketchA, Literal(42))))
    }

    test("convertPredicate does not convert non-deterministic expression.") {
      assert(
        sketch.convertPredicate(
          EqualTo(a, Rand(42)),
          resolvedExprs,
          sketchValues,
          Map.empty,
          valueExtractor) === None)
    }

    test("convertPredicate does not convert literal.") {
      assert(
        sketch.convertPredicate(
          Literal(42),
          resolvedExprs,
          sketchValues,
          Map.empty,
          valueExtractor) === None)
    }

    test("convertPredicate does not convert conjunction.") {
      assert(
        sketch.convertPredicate(
          And(LessThan(a, Literal(42)), GreaterThan(a, Literal(23))),
          resolvedExprs,
          sketchValues,
          Map.empty,
          valueExtractor) === None)
    }

    test("convertPredicate does not convert disjunction.") {
      assert(
        sketch.convertPredicate(
          Or(LessThan(a, Literal(42)), GreaterThan(a, Literal(23))),
          resolvedExprs,
          sketchValues,
          Map.empty,
          valueExtractor) === None)
    }

    test("convertPredicate does not convert predicate having subquery.") {
      assert(
        sketch.convertPredicate(
          InSubquery(Seq(a), ListQuery(LocalRelation(a))),
          resolvedExprs,
          sketchValues,
          Map.empty,
          valueExtractor) === None)
    }

    test("convertPredicate does not convert predicate having unknown attr.") {
      assert(
        sketch.convertPredicate(
          EqualTo(a, b),
          resolvedExprs,
          sketchValues,
          Map.empty,
          valueExtractor) === None)
    }
  }

  {
    val sketch = PartitionSketch(Seq(("A", Some(IntegerType)), ("B", Some(IntegerType))))
    val resolvedExprs = Seq(a, b).map(ExpressionUtils.normalize)
    val sketchValues = Seq(sketchA, sketchB)
    val valueExtractor = AttrValueExtractor(Map(a -> sketchA, b -> sketchB))

    test("convertPredicate converts EqualTo(<col>, <col>).") {
      assert(
        sketch.convertPredicate(
          EqualTo(a, b),
          resolvedExprs,
          sketchValues,
          Map.empty,
          valueExtractor) === Some(EqualTo(sketchA, sketchB)))
    }
  }
}
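A partition column is constant within each file, so the sketch only needs to store one value per file; the tests above show that equality predicates (column vs. literal, or partition column vs. partition column) rewrite directly against the stored values, while non-deterministic, composite, subquery, or unknown-attribute predicates are left unconverted for the surrounding rule to handle. An illustrative call, reusing the fixture names defined in the test file and not taken from the commit itself:

// Illustration only: a, resolvedExprs, sketchValues and valueExtractor are the
// test fixtures defined above for PartitionSketch(Seq(("A", Some(IntegerType)))).
sketch.convertPredicate(
  EqualTo(a, Literal(42)), resolvedExprs, sketchValues, Map.empty, valueExtractor)
// == Some(EqualTo(UnresolvedAttribute("Partition_A"), Literal(42)))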
@ -1,946 +0,0 @@
|
|||
/*
|
||||
* Copyright (2021) The Hyperspace Project Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.hyperspace.index.dataskipping.util
|
||||
|
||||
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
|
||||
import org.apache.spark.sql.catalyst.expressions._
|
||||
import org.apache.spark.sql.types.{BooleanType, IntegerType}
|
||||
import org.mockito.Mockito.{mock, when}
|
||||
|
||||
import com.microsoft.hyperspace.index.HyperspaceSuite
|
||||
|
||||
class ExtractorsTest extends HyperspaceSuite {
|
||||
val a = AttributeReference("A", IntegerType)()
|
||||
val b = AttributeReference("B", BooleanType)()
|
||||
val zero = Literal(0, IntegerType)
|
||||
val one = Literal(1, IntegerType)
|
||||
val two = Literal(2, IntegerType)
|
||||
val trueLit = Literal(true, BooleanType)
|
||||
val falseLit = Literal(false, BooleanType)
|
||||
val nullInt = Literal(null, IntegerType)
|
||||
val nullBool = Literal(null, BooleanType)
|
||||
|
||||
val aMatcher = {
|
||||
val matcher = mock(classOf[ExprMatcher])
|
||||
when(matcher.apply(a)).thenReturn(true)
|
||||
matcher
|
||||
}
|
||||
val bMatcher = {
|
||||
val matcher = mock(classOf[ExprMatcher])
|
||||
when(matcher.apply(b)).thenReturn(true)
|
||||
matcher
|
||||
}
|
||||
val nonMatcher = mock(classOf[ExprMatcher])
|
||||
|
||||
// EqualToExtractor
|
||||
{
|
||||
val AEqualTo = EqualToExtractor(aMatcher)
|
||||
val NoneEqualTo = EqualToExtractor(nonMatcher)
|
||||
|
||||
test("EqualToExtractor matches EqualTo(<expr>, <lit>) if matcher(<expr>).") {
|
||||
val value = EqualTo(a, zero) match {
|
||||
case AEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === Some(zero))
|
||||
}
|
||||
|
||||
test("EqualToExtractor does not match EqualTo(<expr>, null).") {
|
||||
val value = EqualTo(a, nullInt) match {
|
||||
case AEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("EqualToExtractor does not match EqualTo(<expr>, <expr>).") {
|
||||
val value = EqualTo(a, a) match {
|
||||
case AEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("EqualToExtractor does not match EqualTo(<expr>, <lit>) if !matcher(<expr>).") {
|
||||
val value = EqualTo(a, zero) match {
|
||||
case NoneEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("EqualToExtractor matches EqualTo(<lit>, <expr>) if matcher(<expr>).") {
|
||||
val value = EqualTo(zero, a) match {
|
||||
case AEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === Some(zero))
|
||||
}
|
||||
|
||||
test("EqualToExtractor does not match EqualTo(null, <expr>).") {
|
||||
val value = EqualTo(nullInt, a) match {
|
||||
case AEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("EqualToExtractor does not match EqualTo(<lit>, <expr>) if !matcher(<expr>).") {
|
||||
val value = EqualTo(zero, a) match {
|
||||
case NoneEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("EqualToExtractor matches EqualNullSafe(<expr>, <lit>) if matcher(<expr>).") {
|
||||
val value = EqualNullSafe(a, zero) match {
|
||||
case AEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === Some(zero))
|
||||
}
|
||||
|
||||
test("EqualToExtractor does not match EqualNullSafe(<expr>, null).") {
|
||||
val value = EqualNullSafe(a, nullInt) match {
|
||||
case AEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("EqualToExtractor does not match EqualNullSafe(<expr>, <lit>) if !matcher(<expr>).") {
|
||||
val value = EqualNullSafe(a, zero) match {
|
||||
case NoneEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("EqualToExtractor matches EqualNullSafe(<lit>, <expr>) if matcher(<expr>).") {
|
||||
val value = EqualNullSafe(zero, a) match {
|
||||
case AEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === Some(zero))
|
||||
}
|
||||
|
||||
test("EqualToExtractor does not match EqualNullSafe(null, <expr>).") {
|
||||
val value = EqualNullSafe(nullInt, a) match {
|
||||
case AEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("EqualToExtractor does not match EqualNullSafe(<lit>, <expr>) if !matcher(<expr>).") {
|
||||
val value = EqualNullSafe(zero, a) match {
|
||||
case NoneEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("EqualToExtractor does not match expressions other than EqualTo/EqualNullSafe.") {
|
||||
val value = zero match {
|
||||
case AEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
}
|
||||
|
||||
// LessThanExtractor
|
||||
{
|
||||
val ALessThan = LessThanExtractor(aMatcher)
|
||||
val NoneLessThan = LessThanExtractor(nonMatcher)
|
||||
|
||||
test("LessThanExtractor matches LessThan(<expr>, <lit>) if matcher(<expr>).") {
|
||||
val value = LessThan(a, zero) match {
|
||||
case ALessThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === Some(zero))
|
||||
}
|
||||
|
||||
test("LessThanExtractor does not match LessThan(<expr>, null).") {
|
||||
val value = LessThan(a, nullInt) match {
|
||||
case ALessThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("LessThanExtractor does not match LessThan(<expr>, <expr>).") {
|
||||
val value = LessThan(a, a) match {
|
||||
case ALessThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("LessThanExtractor does not match LessThan(<expr>, <lit>) if !matcher(<expr>).") {
|
||||
val value = LessThan(a, zero) match {
|
||||
case NoneLessThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("LessThanExtractor does not match LessThan(<lit>, <expr>).") {
|
||||
val value = LessThan(zero, a) match {
|
||||
case ALessThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("LessThanExtractor matches GreaterThan(<lit>, <expr>) if matcher(<expr>).") {
|
||||
val value = GreaterThan(zero, a) match {
|
||||
case ALessThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === Some(zero))
|
||||
}
|
||||
|
||||
test("LessThanExtractor does not match GreaterThan(null, <expr>).") {
|
||||
val value = GreaterThan(nullInt, a) match {
|
||||
case ALessThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("LessThanExtractor does not match GreaterThan(<lit>, <expr>) if !matcher(<expr>).") {
|
||||
val value = GreaterThan(zero, a) match {
|
||||
case NoneLessThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("LessThanExtractor does not match GreaterThan(<expr>, <lit>).") {
|
||||
val value = GreaterThan(a, zero) match {
|
||||
case ALessThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("LessThanExtractor does not match expressions other than LessThan.") {
|
||||
val value = zero match {
|
||||
case ALessThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
}
|
||||
|
||||
// LessThanOrEqualToExtractor
|
||||
{
|
||||
val ALessThanOrEqualTo = LessThanOrEqualToExtractor(aMatcher)
|
||||
val NoneLessThanOrEqualTo = LessThanOrEqualToExtractor(nonMatcher)
|
||||
|
||||
test(
|
||||
"LessThanOrEqualToExtractor matches LessThanOrEqual(<expr>, <lit>) if matcher(<expr>).") {
|
||||
val value = LessThanOrEqual(a, zero) match {
|
||||
case ALessThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === Some(zero))
|
||||
}
|
||||
|
||||
test("LessThanOrEqualToExtractor does not match LessThanOrEqual(<expr>, null).") {
|
||||
val value = LessThanOrEqual(a, nullInt) match {
|
||||
case ALessThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("LessThanOrEqualToExtractor does not match LessThanOrEqual(<expr>, <expr>).") {
|
||||
val value = LessThanOrEqual(a, a) match {
|
||||
case ALessThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test(
|
||||
"LessThanOrEqualToExtractor does not match LessThanOrEqual(<expr>, <lit>) " +
|
||||
"if !matcher(<expr>).") {
|
||||
val value = LessThanOrEqual(a, zero) match {
|
||||
case NoneLessThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("LessThanOrEqualToExtractor does not match LessThanOrEqual(<lit>, <expr>).") {
|
||||
val value = LessThanOrEqual(zero, a) match {
|
||||
case ALessThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test(
|
||||
"LessThanOrEqualToExtractor matches GreaterThanOrEqual(<lit>, <expr>) if matcher(<expr>).") {
|
||||
val value = GreaterThanOrEqual(zero, a) match {
|
||||
case ALessThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === Some(zero))
|
||||
}
|
||||
|
||||
test("LessThanOrEqualToExtractor does not match GreaterThanOrEqual(null, <expr>).") {
|
||||
val value = GreaterThanOrEqual(nullInt, a) match {
|
||||
case ALessThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test(
|
||||
"LessThanOrEqualToExtractor does not match GreaterThanOrEqual(<lit>, <expr>) " +
|
||||
"if !matcher(<expr>).") {
|
||||
val value = GreaterThanOrEqual(zero, a) match {
|
||||
case NoneLessThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("LessThanOrEqualToExtractor does not match GreaterThanOrEqual(<expr>, <lit>).") {
|
||||
val value = GreaterThanOrEqual(a, zero) match {
|
||||
case ALessThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("LessThanOrEqualToExtractor does not match expressions other than LessThanOrEqual.") {
|
||||
val value = zero match {
|
||||
case ALessThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
}
|
||||
|
||||
// GreaterThanExtractor
|
||||
{
|
||||
val AGreaterThan = GreaterThanExtractor(aMatcher)
|
||||
val NoneGreaterThan = GreaterThanExtractor(nonMatcher)
|
||||
|
||||
test("GreaterThanExtractor matches GreaterThan(<expr>, <lit>) if matcher(<expr>).") {
|
||||
val value = GreaterThan(a, zero) match {
|
||||
case AGreaterThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === Some(zero))
|
||||
}
|
||||
|
||||
test("GreaterThanExtractor does not match GreaterThan(<expr>, null).") {
|
||||
val value = GreaterThan(a, nullInt) match {
|
||||
case AGreaterThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("GreaterThanExtractor does not match GreaterThan(<expr>, <expr>).") {
|
||||
val value = GreaterThan(a, a) match {
|
||||
case AGreaterThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("GreaterThanExtractor does not match GreaterThan(<expr>, <lit>) if !matcher(<expr>).") {
|
||||
val value = GreaterThan(a, zero) match {
|
||||
case NoneGreaterThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("GreaterThanExtractor does not match GreaterThan(<lit>, <expr>).") {
|
||||
val value = GreaterThan(zero, a) match {
|
||||
case AGreaterThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("GreaterThanExtractor matches LessThan(<lit>, <expr>) if matcher(<expr>).") {
|
||||
val value = LessThan(zero, a) match {
|
||||
case AGreaterThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === Some(zero))
|
||||
}
|
||||
|
||||
test("GreaterThanExtractor does not match LessThan(null, <expr>).") {
|
||||
val value = LessThan(nullInt, a) match {
|
||||
case AGreaterThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("GreaterThanExtractor does not match LessThan(<lit>, <expr>) if !matcher(<expr>).") {
|
||||
val value = LessThan(zero, a) match {
|
||||
case NoneGreaterThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("GreaterThanExtractor does not match LessThan(<expr>, <lit>).") {
|
||||
val value = LessThan(a, zero) match {
|
||||
case AGreaterThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("GreaterThanExtractor does not match expressions other than GreaterThan.") {
|
||||
val value = zero match {
|
||||
case AGreaterThan(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
}
|
||||
|
||||
// GreaterThanOrEqualToExtractor
|
||||
{
|
||||
val AGreaterThanOrEqualTo = GreaterThanOrEqualToExtractor(aMatcher)
|
||||
val NoneGreaterThanOrEqualTo = GreaterThanOrEqualToExtractor(nonMatcher)
|
||||
|
||||
test(
|
||||
"GreaterThanOrEqualToExtractor matches GreaterThanOrEqual(<expr>, <lit>) " +
|
||||
"if matcher(<expr>).") {
|
||||
val value = GreaterThanOrEqual(a, zero) match {
|
||||
case AGreaterThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === Some(zero))
|
||||
}
|
||||
|
||||
test("GreaterThanOrEqualToExtractor does not match GreaterThanOrEqual(<expr>, null).") {
|
||||
val value = GreaterThanOrEqual(a, nullInt) match {
|
||||
case AGreaterThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("GreaterThanOrEqualToExtractor does not match GreaterThanOrEqual(<expr>, <expr>).") {
|
||||
val value = GreaterThanOrEqual(a, a) match {
|
||||
case AGreaterThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test(
|
||||
"GreaterThanOrEqualToExtractor does not match GreaterThanOrEqual(<expr>, <lit>) " +
|
||||
"if !matcher(<expr>).") {
|
||||
val value = GreaterThanOrEqual(a, zero) match {
|
||||
case NoneGreaterThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("GreaterThanOrEqualToExtractor does not match GreaterThanOrEqual(<lit>, <expr>).") {
|
||||
val value = GreaterThanOrEqual(zero, a) match {
|
||||
case AGreaterThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test(
|
||||
"GreaterThanOrEqualToExtractor matches LessThanOrEqual(<lit>, <expr>) if matcher(<expr>).") {
|
||||
val value = LessThanOrEqual(zero, a) match {
|
||||
case AGreaterThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === Some(zero))
|
||||
}
|
||||
|
||||
test("GreaterThanOrEqualToExtractor does not match LessThanOrEqual(null, <expr>).") {
|
||||
val value = LessThanOrEqual(nullInt, a) match {
|
||||
case AGreaterThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test(
|
||||
"GreaterThanOrEqualToExtractor does not match LessThanOrEqual(<lit>, <expr>) " +
|
||||
"if !matcher(<expr>).") {
|
||||
val value = LessThanOrEqual(zero, a) match {
|
||||
case NoneGreaterThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("GreaterThanOrEqualToExtractor does not match LessThanOrEqual(<expr>, <lit>).") {
|
||||
val value = LessThanOrEqual(a, zero) match {
|
||||
case AGreaterThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test(
|
||||
"GreaterThanOrEqualToExtractor does not match expressions other than GreaterThanOrEqual.") {
|
||||
val value = zero match {
|
||||
case AGreaterThanOrEqualTo(value) => Some(value)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
}
|
||||
|
||||
// IsNullExtractor
|
||||
{
|
||||
val AIsNull = IsNullExtractor(aMatcher)
|
||||
val NoneIsNull = IsNullExtractor(nonMatcher)
|
||||
|
||||
test("IsNullExtractor matches IsNull(<expr>) if matcher(<expr>).") {
|
||||
val value = IsNull(a) match {
|
||||
case AIsNull() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(value)
|
||||
}
|
||||
|
||||
test("IsNullExtractor does not match IsNull(<expr>) if !matcher(<expr>).") {
|
||||
val value = IsNull(a) match {
|
||||
case NoneIsNull() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test("IsNullExtractor matches EqualNullSafe(<expr>, null).") {
|
||||
val value = EqualNullSafe(a, nullInt) match {
|
||||
case AIsNull() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(value)
|
||||
}
|
||||
|
||||
test("IsNullExtractor does not match EqualNullSafe(<expr>, <lit>).") {
|
||||
val value = EqualNullSafe(a, zero) match {
|
||||
case AIsNull() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test("IsNullExtractor matches EqualNullSafe(null, <expr>).") {
|
||||
val value = EqualNullSafe(nullInt, a) match {
|
||||
case AIsNull() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(value)
|
||||
}
|
||||
|
||||
test("IsNullExtractor does not match EqualNullSafe(<lit>, <expr>).") {
|
||||
val value = EqualNullSafe(zero, a) match {
|
||||
case AIsNull() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test("IsNullExtractor does not match expressions other than IsNull/EqualNullSafe.") {
|
||||
val value = zero match {
|
||||
case AIsNull() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
}
|
||||
|
||||
// IsNotNullExtractor
|
||||
{
|
||||
val AIsNotNull = IsNotNullExtractor(aMatcher)
|
||||
val NoneIsNotNull = IsNotNullExtractor(nonMatcher)
|
||||
|
||||
test("IsNotNullExtractor matches IsNotNull(<expr>) if matcher(<expr>).") {
|
||||
val value = IsNotNull(a) match {
|
||||
case AIsNotNull() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(value)
|
||||
}
|
||||
|
||||
test("IsNotNullExtractor does not match IsNotNull(<expr>) if !matcher(<expr>).") {
|
||||
val value = IsNotNull(a) match {
|
||||
case NoneIsNotNull() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test("IsNotNullExtractor matches Not(IsNull(<expr>)) if matcher(<expr>).") {
|
||||
val value = Not(IsNull(a)) match {
|
||||
case AIsNotNull() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(value)
|
||||
}
|
||||
|
||||
test("IsNotNullExtractor matches Not(IsNull(<expr>)) if !matcher(<expr>).") {
|
||||
val value = Not(IsNull(a)) match {
|
||||
case NoneIsNotNull() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test("IsNotNullExtractor matches Not(EqualNullSafe(<expr>, null)).") {
|
||||
val value = Not(EqualNullSafe(a, nullInt)) match {
|
||||
case AIsNotNull() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(value)
|
||||
}
|
||||
|
||||
test("IsNotNullExtractor does not match Not(EqualNullSafe(<expr>, <lit>)).") {
|
||||
val value = Not(EqualNullSafe(a, zero)) match {
|
||||
case AIsNotNull() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test("IsNotNullExtractor matches Not(EqualNullSafe(null, <expr>)).") {
|
||||
val value = Not(EqualNullSafe(nullInt, a)) match {
|
||||
case AIsNotNull() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(value)
|
||||
}
|
||||
|
||||
test("IsNotNullExtractor does not match Not(EqualNullSafe(<lit>, <expr>)).") {
|
||||
val value = Not(EqualNullSafe(zero, a)) match {
|
||||
case AIsNotNull() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test(
|
||||
"IsNotNullExtractor does not match expressions other than IsNotNull/Not(EqualNullSafe).") {
|
||||
val value = zero match {
|
||||
case AIsNotNull() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
}
|
||||
|
||||
// IsTrueExtractor
|
||||
{
|
||||
val AIsTrue = IsTrueExtractor(aMatcher)
|
||||
val BIsTrue = IsTrueExtractor(bMatcher)
|
||||
val NoneIsTrue = IsTrueExtractor(nonMatcher)
|
||||
|
||||
test("IsTrueExtractor matches <expr> if matcher(<expr>).") {
|
||||
val value = b match {
|
||||
case BIsTrue() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(value)
|
||||
}
|
||||
|
||||
test("IsTrueExtractor does not match <expr> if !matcher(<expr>).") {
|
||||
val value = b match {
|
||||
case NoneIsTrue() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test("IsTrueExtractor does not match <expr> if type is not boolean.") {
|
||||
val value = a match {
|
||||
case AIsTrue() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test("IsTrueExtractor matches EqualTo(<expr>, true) if matcher(<expr>).") {
|
||||
val value = EqualTo(b, trueLit) match {
|
||||
case BIsTrue() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(value)
|
||||
}
|
||||
|
||||
test("IsTrueExtractor does not match EqualTo(<expr>, false).") {
|
||||
val value = EqualTo(b, falseLit) match {
|
||||
case BIsTrue() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test("IsTrueExtractor matches EqualTo(true, <expr>) if matcher(<expr>).") {
|
||||
val value = EqualTo(trueLit, b) match {
|
||||
case BIsTrue() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(value)
|
||||
}
|
||||
|
||||
test("IsTrueExtractor does not match EqualTo(false, <expr>).") {
|
||||
val value = EqualTo(falseLit, b) match {
|
||||
case BIsTrue() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test("IsTrueExtractor matches EqualNullSafe(<expr>, true) if matcher(<expr>).") {
|
||||
val value = EqualNullSafe(b, trueLit) match {
|
||||
case BIsTrue() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(value)
|
||||
}
|
||||
|
||||
test("IsTrueExtractor does not match EqualNullSafe(<expr>, false).") {
|
||||
val value = EqualNullSafe(b, falseLit) match {
|
||||
case BIsTrue() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test("IsTrueExtractor matches EqualNullSafe(true, <expr>) if matcher(<expr>).") {
|
||||
val value = EqualNullSafe(trueLit, b) match {
|
||||
case BIsTrue() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(value)
|
||||
}
|
||||
|
||||
test("IsTrueExtractor does not match EqualNullSafe(false, <expr>).") {
|
||||
val value = EqualNullSafe(falseLit, b) match {
|
||||
case BIsTrue() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test("IsTrueExtractor does not match <lit>.") {
|
||||
val value = zero match {
|
||||
case BIsTrue() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
}
|
||||
|
||||
// IsFalseExtractor
|
||||
{
|
||||
val BIsFalse = IsFalseExtractor(bMatcher)
|
||||
val NoneIsFalse = IsFalseExtractor(nonMatcher)
|
||||
|
||||
test("IsFalseExtractor matches Not(<expr>) if matcher(<expr>).") {
|
||||
val value = Not(b) match {
|
||||
case BIsFalse() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(value)
|
||||
}
|
||||
|
||||
test("IsFalseExtractor does not match Not(<expr>) if !matcher(<expr>).") {
|
||||
val value = Not(b) match {
|
||||
case NoneIsFalse() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test("IsFalseExtractor matches EqualTo(<expr>, false) if matcher(<expr>).") {
|
||||
val value = EqualTo(b, falseLit) match {
|
||||
case BIsFalse() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(value)
|
||||
}
|
||||
|
||||
test("IsFalseExtractor does not match EqualTo(<expr>, true).") {
|
||||
val value = EqualTo(b, trueLit) match {
|
||||
case BIsFalse() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test("IsFalseExtractor matches EqualTo(false, <expr>) if matcher(<expr>).") {
|
||||
val value = EqualTo(falseLit, b) match {
|
||||
case BIsFalse() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(value)
|
||||
}
|
||||
|
||||
test("IsFalseExtractor does not match EqualTo(true, <expr>).") {
|
||||
val value = EqualTo(trueLit, b) match {
|
||||
case BIsFalse() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test("IsFalseExtractor matches EqualNullSafe(<expr>, false) if matcher(<expr>).") {
|
||||
val value = EqualNullSafe(b, falseLit) match {
|
||||
case BIsFalse() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(value)
|
||||
}
|
||||
|
||||
test("IsFalseExtractor does not match EqualNullSafe(<expr>, true).") {
|
||||
val value = EqualNullSafe(b, trueLit) match {
|
||||
case BIsFalse() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test("IsFalseExtractor matches EqualNullSafe(false, <expr>) if matcher(<expr>).") {
|
||||
val value = EqualNullSafe(falseLit, b) match {
|
||||
case BIsFalse() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(value)
|
||||
}
|
||||
|
||||
test("IsFalseExtractor does not match EqualNullSafe(true, <expr>).") {
|
||||
val value = EqualNullSafe(trueLit, b) match {
|
||||
case BIsFalse() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
|
||||
test("IsFalseExtractor does not match <lit>.") {
|
||||
val value = zero match {
|
||||
case BIsFalse() => true
|
||||
case _ => false
|
||||
}
|
||||
assert(!value)
|
||||
}
|
||||
}
|
||||
|
||||
// InExtractor
|
||||
{
|
||||
val AIn = InExtractor(aMatcher)
|
||||
val NoneIn = InExtractor(nonMatcher)
|
||||
|
||||
test("InExtractor matches In(<expr>, <lit>*) if matcher(<expr>).") {
|
||||
val value = In(a, Seq(zero, one, two)) match {
|
||||
case AIn(values) => Some(values)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === Some(Seq(zero, one, two)))
|
||||
}
|
||||
|
||||
test("InExtractor matches In(<expr>, <lit>*) if !matcher(<expr>).") {
|
||||
val value = In(a, Seq(zero, one, two)) match {
|
||||
case NoneIn(values) => Some(values)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("InExtractor does not match In(<expr>, <lit>*) if <lit>* is empty.") {
|
||||
val value = In(a, Nil) match {
|
||||
case AIn(values) => Some(values)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("InExtractor matches In(<expr>, <lit>*) if some <lit>s are null.") {
|
||||
val value = In(a, Seq(zero, nullInt, nullInt, two)) match {
|
||||
case AIn(values) => Some(values)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === Some(Seq(zero, two)))
|
||||
}
|
||||
|
||||
test("InExtractor does not match In(<expr>, <expr>*).") {
|
||||
val value = In(a, Seq(zero, two, a)) match {
|
||||
case AIn(values) => Some(values)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("InExtractor does not match other than In.") {
|
||||
val value = a match {
|
||||
case AIn(values) => Some(values)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
}
|
||||
|
||||
// InSetExtractor
|
||||
{
|
||||
val AInSet = InSetExtractor(aMatcher)
|
||||
val NoneInSet = InSetExtractor(nonMatcher)
|
||||
|
||||
test("InSetExtractor matches InSet(<expr>, <set>) if matcher(<expr>).") {
|
||||
val value = InSet(a, Set(0, 1, 2)) match {
|
||||
case AInSet(values) => Some(values)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === Some(Set(0, 1, 2)))
|
||||
}
|
||||
|
||||
test("InSetExtractor matches InSet(<expr>, <lit>*) if !matcher(<expr>).") {
|
||||
val value = InSet(a, Set(0, 1, 2)) match {
|
||||
case NoneInSet(values) => Some(values)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
|
||||
test("InSetExtractor does not match other than InSet.") {
|
||||
val value = a match {
|
||||
case AInSet(values) => Some(values)
|
||||
case _ => None
|
||||
}
|
||||
assert(value === None)
|
||||
}
|
||||
}
|
||||
}
|