Mirror of https://github.com/py-why/EconML.git
Vasilis/docs (#370)
* Restructured modules into folders; deprecation warnings for the old naming conventions.
* Restructured the public module reference in the docs.
* Removed AutoML from the docs for now, until the import is fixed.
* Fixed wording in hyperparameter tuning.
* Fixed reference to honest forest in the docs.
* Added verbosity to bootstrap.
* Moved bootstrap to the private API.
This commit is contained in:
Parent
69fadc3951
Commit
a27fa3cdaa
22 README.md
@@ -162,7 +162,7 @@ To install from source, see [For Developers](#for-developers) section below.
<summary>Orthogonal Random Forests (click to expand)</summary>

```Python
- from econml.ortho_forest import DMLOrthoForest, DROrthoForest
+ from econml.orf import DMLOrthoForest, DROrthoForest
from econml.sklearn_extensions.linear_model import WeightedLasso, WeightedLassoCV
# Use defaults
est = DMLOrthoForest()

@@ -233,7 +233,7 @@ To install from source, see [For Developers](#for-developers) section below.
* Linear final stage

```Python
- from econml.drlearner import LinearDRLearner
+ from econml.dr import LinearDRLearner
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

est = LinearDRLearner(model_propensity=GradientBoostingClassifier(),

@@ -246,7 +246,7 @@ lb, ub = est.effect_interval(X_test, alpha=0.05)
* Sparse linear final stage

```Python
- from econml.drlearner import SparseLinearDRLearner
+ from econml.dr import SparseLinearDRLearner
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

est = SparseLinearDRLearner(model_propensity=GradientBoostingClassifier(),

@@ -259,7 +259,7 @@ lb, ub = est.effect_interval(X_test, alpha=0.05)
* Nonparametric final stage

```Python
- from econml.drlearner import ForestDRLearner
+ from econml.dr import ForestDRLearner
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

est = ForestDRLearner(model_propensity=GradientBoostingClassifier(),

@@ -276,7 +276,7 @@ lb, ub = est.effect_interval(X_test, alpha=0.05)
* Intent to Treat Doubly Robust Learner (discrete instrument, discrete treatment)

```Python
- from econml.ortho_iv import LinearIntentToTreatDRIV
+ from econml.iv.dr import LinearIntentToTreatDRIV
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression

@@ -295,7 +295,7 @@ lb, ub = est.effect_interval(X_test, alpha=0.05) # OLS confidence intervals
```Python
import keras
- from econml.deepiv import DeepIVEstimator
+ from econml.iv.nnet import DeepIV

treatment_model = keras.Sequential([keras.layers.Dense(128, activation='relu', input_shape=(2,)),
                                    keras.layers.Dropout(0.17),

@@ -310,11 +310,11 @@ response_model = keras.Sequential([keras.layers.Dense(128, activation='relu', in
                                   keras.layers.Dense(32, activation='relu'),
                                   keras.layers.Dropout(0.17),
                                   keras.layers.Dense(1)])
- est = DeepIVEstimator(n_components=10, # Number of gaussians in the mixture density networks
-                       m=lambda z, x: treatment_model(keras.layers.concatenate([z, x])), # Treatment model
-                       h=lambda t, x: response_model(keras.layers.concatenate([t, x])), # Response model
-                       n_samples=1 # Number of samples used to estimate the response
-                       )
+ est = DeepIV(n_components=10, # Number of gaussians in the mixture density networks
+              m=lambda z, x: treatment_model(keras.layers.concatenate([z, x])), # Treatment model
+              h=lambda t, x: response_model(keras.layers.concatenate([t, x])), # Response model
+              n_samples=1 # Number of samples used to estimate the response
+              )
est.fit(Y, T, X=X, Z=Z) # Z -> instrumental variables
treatment_effects = est.effect(X_test)
```
@@ -212,7 +212,8 @@ epub_exclude_files = ['search.html']
intersphinx_mapping = {'python': ('https://docs.python.org/3', None),
                       'numpy': ('https://docs.scipy.org/doc/numpy/', None),
                       'sklearn': ('https://scikit-learn.org/stable/', None),
-                      'matplotlib': ('https://matplotlib.org/', None)}
+                      'matplotlib': ('https://matplotlib.org/', None),
+                      'shap': ('https://shap.readthedocs.io/en/stable/', None)}

# -- Options for todo extension ----------------------------------------------
34 doc/map.svg
|
@ -94,24 +94,24 @@
|
|||
<path d="M9.50005 785.667C9.50005 771.768 20.7678 760.5 34.6674 760.5L369.333 760.5C383.232 760.5 394.5 771.768 394.5 785.667L394.5 886.333C394.5 900.232 383.232 911.5 369.333 911.5L34.6674 911.5C20.7678 911.5 9.50005 900.232 9.50005 886.333Z" stroke="#7030A0" stroke-width="1.33333" stroke-miterlimit="8" fill="#7030A0" fill-rule="evenodd" fill-opacity="0.341176"/>
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(111.984 896)">Exploit Explicit Randomization</text>
|
||||
<rect x="26" y="781" width="178" height="97" stroke="#FFFFFF" stroke-width="2" stroke-miterlimit="8" fill="#70AD47"/>
|
||||
<a href="_autosummary/econml.two_stage_least_squares.html" target="_parent">
|
||||
<a href="_autosummary/econml.iv.sieve.SieveTSLS.html" target="_parent">
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(93.7427 817)">2SLS IV</text>
|
||||
</a>
|
||||
<path d="M93.7427 819.138 114.743 819.138 135.743 819.138 135.743 820.138 114.743 820.138 93.7427 820.138Z" fill="#FFFFFF" fill-rule="evenodd"/>
|
||||
<a href="_autosummary/econml.ortho_iv.html#econml.ortho_iv.IntentToTreatDRIV" target="_parent">
|
||||
<a href="_autosummary/econml.iv.dr.IntentToTreatDRIV" target="_parent">
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(59.2427 834)">IntentToTreatDRIV</text>
|
||||
</a>
|
||||
<path d="M59.7427 836.138 96.7427 836.138 133.743 836.138 170.743 836.138 170.743 837.138 133.743 837.138 96.7427 837.138 59.7427 837.138Z" fill="#FFFFFF" fill-rule="evenodd"/>
|
||||
<a href="_autosummary/econml.ortho_iv.html#econml.ortho_iv.LinearIntentToTreatDRIV" target="_parent">
|
||||
<a href="_autosummary/econml.iv.dr.LinearIntentToTreatDRIV.html" target="_parent">
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(40.9094 852)">LinearIntentToTreatDRIV</text>
|
||||
</a>
|
||||
<path d="M40.7427 854.138 77.7427 854.138 114.743 854.138 151.743 854.138 188.743 854.138 188.743 855.138 151.743 855.138 114.743 855.138 77.7427 855.138 40.7427 855.138Z" fill="#FFFFFF" fill-rule="evenodd"/>
|
||||
<rect x="216" y="781" width="163" height="97" stroke="#FFFFFF" stroke-width="2" stroke-miterlimit="8" fill="#70AD47"/>
|
||||
<a href="_autosummary/econml.deepiv.html" target="_parent">
|
||||
<a href="_autosummary/econml.iv.nnet.DeepIV.html" target="_parent">
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(274.239 808)">Deep IV</text>
|
||||
</a>
|
||||
<path d="M274.659 810.338 297.659 810.338 320.659 810.338 320.659 811.338 297.659 811.338 274.659 811.338Z" fill="#FFFFFF" fill-rule="evenodd"/>
|
||||
<a href="_autosummary/econml.ortho_iv.html#econml.ortho_iv.NonParamDMLIV" target="_parent">
|
||||
<a href="_autosummary/econml.iv.dr.NonParamDMLIV.html" target="_parent">
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(246.159 825)">NonParamDMLIV</text>
|
||||
</a>
|
||||
<path d="M246.659 827.338 280.659 827.338 314.659 827.338 348.659 827.338 348.659 828.338 314.659 828.338 280.659 828.338 246.659 828.338Z" fill="#FFFFFF" fill-rule="evenodd"/>
|
||||
|
@ -171,54 +171,54 @@
|
|||
<tspan x="31.92" y="17">Experimentation</tspan>
|
||||
</text>
|
||||
<rect x="470" y="461" width="226" height="62" stroke="#FFFFFF" stroke-width="2" stroke-miterlimit="8" fill="#70AD47"/>
|
||||
<a href="_autosummary/econml.dml.html#econml.dml.SparseLinearDML" target="_parent">
|
||||
<a href="_autosummary/econml.dml.SparseLinearDML.html" target="_parent">
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(530 488)">SparseLinearDML</text>
|
||||
</a>
|
||||
<path d="M489.012 490.461 536.012 490.461 583.012 490.461 630.012 490.461 677.012 490.461 677.012 491.461 630.012 491.461 583.012 491.461 536.012 491.461 489.012 491.461Z" fill="#FFFFFF" fill-rule="evenodd"/>
|
||||
<a href="_autosummary/econml.drlearner.html#econml.drlearner.SparseLinearDRLearner" target="_parent">
|
||||
<a href="_autosummary/econml.dr.SparseLinearDRLearner.html" target="_parent">
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(513.845 505)">SparseLinearDRLearner</text>
|
||||
</a>
|
||||
<path d="M514.012 507.461 560.345 507.461 606.678 507.461 653.012 507.461 653.012 508.461 606.678 508.461 560.345 508.461 514.012 508.461Z" fill="#FFFFFF" fill-rule="evenodd"/>
|
||||
<rect x="471" y="549" width="226" height="51" stroke="#FFFFFF" stroke-width="2" stroke-miterlimit="8" fill="#70AD47"/>
|
||||
<a href="_autosummary/econml.dml.html#econml.dml.LinearDML" target="_parent">
|
||||
<a href="_autosummary/econml.dml.LinearDML.html" target="_parent">
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(550 571)">LinearDML</text>
|
||||
</a>
|
||||
<path d="M509.292 573.095 558.959 573.095 608.626 573.095 658.292 573.095 658.292 574.095 608.626 574.095 558.959 574.095 509.292 574.095Z" fill="#FFFFFF" fill-rule="evenodd"/>
|
||||
<a href="_autosummary/econml.drlearner.html#econml.drlearner.LinearDRLearner" target="_parent">
|
||||
<a href="_autosummary/econml.dr.LinearDRLearner.html" target="_parent">
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(533.872 588)">LinearDRLearner</text>
|
||||
</a>
|
||||
<path d="M534.292 590.095 583.792 590.095 633.292 590.095 633.292 591.095 583.792 591.095 534.292 591.095Z" fill="#FFFFFF" fill-rule="evenodd"/>
|
||||
<rect x="471" y="614" width="226" height="90" stroke="#FFFFFF" stroke-width="2" stroke-miterlimit="8" fill="#70AD47"/>
|
||||
<a href="_autosummary/econml.ortho_forest.html#econml.ortho_forest.DMLOrthoForest" target="_parent">
|
||||
<a href="_autosummary/econml.orf.DMLOrthoForest.html" target="_parent">
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(535 638)">DMLOrthoForest</text>
|
||||
</a>
|
||||
<path d="M482.292 639.866 533.042 639.866 583.792 639.866 634.542 639.866 685.292 639.866 685.292 640.866 634.542 640.866 583.792 640.866 533.042 640.866 482.292 640.866Z" fill="#FFFFFF" fill-rule="evenodd"/>
|
||||
<a href="_autosummary/econml.ortho_forest.html#econml.ortho_forest.DROrthoForest" target="_parent">
|
||||
<a href="_autosummary/econml.orf.DROrthoForest.html" target="_parent">
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(540 655)">DROrthoForest</text>
|
||||
</a>
|
||||
<path d="M491.292 656.866 537.292 656.866 583.292 656.866 629.292 656.866 675.292 656.866 675.292 657.866 629.292 657.866 583.292 657.866 537.292 657.866 491.292 657.866Z" fill="#FFFFFF" fill-rule="evenodd"/>
|
||||
<a href="_autosummary/econml.drlearner.html#econml.drlearner.ForestDRLearner" target="_parent">
|
||||
<a href="_autosummary/econml.dr.ForestDRLearner.html" target="_parent">
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(533.539 673)">ForestDRLearner</text>
|
||||
</a>
|
||||
<path d="M533.292 674.866 583.292 674.866 633.292 674.866 633.292 675.866 583.292 675.866 533.292 675.866Z" fill="#FFFFFF" fill-rule="evenodd"/>
|
||||
<a href="_autosummary/econml.dml.html#econml.dml.CausalForestDML" target="_parent">
|
||||
<a href="_autosummary/econml.dml.CausalForestDML.html" target="_parent">
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(531 691)">CausalForestDML</text>
|
||||
</a>
|
||||
<path d="M509.292 692.866 558.959 692.866 608.626 692.866 658.292 692.866 658.292 693.866 608.626 693.866 558.959 693.866 509.292 693.866Z" fill="#FFFFFF" fill-rule="evenodd"/>
|
||||
<rect x="471" y="716" width="226" height="131" stroke="#FFFFFF" stroke-width="2" stroke-miterlimit="8" fill="#70AD47"/>
|
||||
<a href="_autosummary/econml.metalearners.html" target="_parent">
|
||||
<a href="reference.html#meta-learners" target="_parent">
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(542.042 752)">MetaLearners</text>
|
||||
</a>
|
||||
<path d="M542.296 753.513 583.796 753.513 625.296 753.513 625.296 754.513 583.796 754.513 542.296 754.513Z" fill="#FFFFFF" fill-rule="evenodd"/>
|
||||
<a href="_autosummary/econml.drlearner.html#econml.drlearner.DRLearner" target="_parent">
|
||||
<a href="_autosummary/econml.dr.DRLearner.html" target="_parent">
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(552.209 769)">DRLearner</text>
|
||||
</a>
|
||||
<path d="M552.296 770.513 583.796 770.513 615.296 770.513 615.296 771.513 583.796 771.513 552.296 771.513Z" fill="#FFFFFF" fill-rule="evenodd"/>
|
||||
<a href="_autosummary/econml.dml.html#econml.dml.DML" target="_parent">
|
||||
<a href="_autosummary/econml.dml.DML.html" target="_parent">
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(570 787)">DML</text>
|
||||
</a>
|
||||
<path d="M527.296 788.513 564.629 788.513 601.962 788.513 639.296 788.513 639.296 789.513 601.962 789.513 564.629 789.513 527.296 789.513Z" fill="#FFFFFF" fill-rule="evenodd"/>
|
||||
<a href="_autosummary/econml.dml.html#econml.dml.NonParamDML" target="_parent">
|
||||
<a href="_autosummary/econml.dml.NonParamDML.html" target="_parent">
|
||||
<text fill="#FFFFFF" font-family="Calibri,Calibri_MSFontService,sans-serif" font-weight="400" font-size="15" transform="translate(536 805)">NonParamDML</text>
|
||||
</a>
|
||||
<path d="M496.296 806.513 540.046 806.513 583.796 806.513 627.546 806.513 671.296 806.513 671.296 807.513 627.546 807.513 583.796 807.513 540.046 807.513 496.296 807.513Z" fill="#FFFFFF" fill-rule="evenodd"/>
|
||||
|
|
Before Width: | Height: | Size: 56 KiB   After Width: | Height: | Size: 55 KiB
|
@ -1,21 +1,232 @@
|
|||
Public Module Reference
|
||||
=======================
|
||||
|
||||
CATE Estimators
|
||||
---------------
|
||||
|
||||
.. _dml_api:
|
||||
|
||||
Double Machine Learning (DML)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.bootstrap
|
||||
econml.cate_interpreter
|
||||
econml.deepiv
|
||||
econml.dml
|
||||
econml.drlearner
|
||||
econml.grf
|
||||
econml.inference
|
||||
econml.metalearners
|
||||
econml.ortho_forest
|
||||
econml.ortho_iv
|
||||
econml.score
|
||||
econml.two_stage_least_squares
|
||||
econml.dml.DML
|
||||
econml.dml.LinearDML
|
||||
econml.dml.SparseLinearDML
|
||||
econml.dml.CausalForestDML
|
||||
econml.dml.NonParamDML
|
||||
|
||||
.. _dr_api:
|
||||
|
||||
Doubly Robust (DR)
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.dr.DRLearner
|
||||
econml.dr.LinearDRLearner
|
||||
econml.dr.SparseLinearDRLearner
|
||||
econml.dr.ForestDRLearner
|
||||
|
||||
.. _metalearners_api:
|
||||
|
||||
Meta-Learners
|
||||
^^^^^^^^^^^^^
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.metalearners.XLearner
|
||||
econml.metalearners.TLearner
|
||||
econml.metalearners.SLearner
|
||||
econml.metalearners.DomainAdaptationLearner
|
||||
|
||||
.. _orf_api:
|
||||
|
||||
Orthogonal Random Forest (ORF)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.orf.DMLOrthoForest
|
||||
econml.orf.DROrthoForest
|
||||
|
||||
Instrumental Variable CATE Estimators
|
||||
-------------------------------------
|
||||
|
||||
.. _dmliv_api:
|
||||
|
||||
Double Machine Learning (DML) IV
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.iv.dml.DMLATEIV
|
||||
econml.iv.dml.ProjectedDMLATEIV
|
||||
econml.iv.dml.DMLIV
|
||||
econml.iv.dml.NonParamDMLIV
|
||||
|
||||
.. _driv_api:
|
||||
|
||||
Doubly Robust (DR) IV
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.iv.dr.IntentToTreatDRIV
|
||||
econml.iv.dr.LinearIntentToTreatDRIV
|
||||
|
||||
.. _deepiv_api:
|
||||
|
||||
DeepIV
|
||||
^^^^^^
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.iv.nnet.DeepIV
|
||||
|
||||
.. _tsls_api:
|
||||
|
||||
Sieve Methods
|
||||
^^^^^^^^^^^^^
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.iv.sieve.SieveTSLS
|
||||
econml.iv.sieve.HermiteFeatures
|
||||
econml.iv.sieve.DPolynomialFeatures
|
||||
|
||||
|
||||
.. _interpreters_api:
|
||||
|
||||
CATE Interpreters
|
||||
-----------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.cate_interpreter.SingleTreeCateInterpreter
|
||||
econml.cate_interpreter.SingleTreePolicyInterpreter
|
||||
|
||||
.. _scorers_api:
|
||||
|
||||
CATE Scorers
|
||||
------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.score.RScorer
|
||||
econml.score.EnsembleCateEstimator
|
||||
|
||||
|
||||
.. _grf_api:
|
||||
|
||||
Generalized Random Forests
|
||||
--------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.grf.CausalForest
|
||||
econml.grf.CausalIVForest
|
||||
econml.grf.RegressionForest
|
||||
econml.grf.MultiOutputGRF
|
||||
econml.grf.LinearMomentGRFCriterion
|
||||
econml.grf.LinearMomentGRFCriterionMSE
|
||||
econml.grf._base_grf.BaseGRF
|
||||
econml.grf._base_grftree.GRFTree
|
||||
|
||||
|
||||
.. Integration with AzureML AutoML
|
||||
.. -------------------------------
|
||||
|
||||
.. .. autosummary::
|
||||
.. :toctree: _autosummary
|
||||
|
||||
.. econml.automated_ml
|
||||
|
||||
Scikit-Learn Extensions
|
||||
-----------------------
|
||||
|
||||
.. _sklearn_linear_api:
|
||||
|
||||
Linear Model Extensions
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.sklearn_extensions.linear_model.DebiasedLasso
|
||||
econml.sklearn_extensions.linear_model.MultiOutputDebiasedLasso
|
||||
econml.sklearn_extensions.linear_model.SelectiveRegularization
|
||||
econml.sklearn_extensions.linear_model.StatsModelsLinearRegression
|
||||
econml.sklearn_extensions.linear_model.StatsModelsRLM
|
||||
econml.sklearn_extensions.linear_model.WeightedLasso
|
||||
econml.sklearn_extensions.linear_model.WeightedLassoCV
|
||||
econml.sklearn_extensions.linear_model.WeightedMultiTaskLassoCV
|
||||
econml.sklearn_extensions.linear_model.WeightedLassoCVWrapper
|
||||
|
||||
.. _sklearn_model_api:
|
||||
|
||||
Model Selection Extensions
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.sklearn_extensions.model_selection.GridSearchCVList
|
||||
econml.sklearn_extensions.model_selection.WeightedKFold
|
||||
econml.sklearn_extensions.model_selection.WeightedStratifiedKFold
|
||||
|
||||
|
||||
.. _inference_api:
|
||||
|
||||
Inference
|
||||
---------
|
||||
|
||||
Inference Results
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.inference.NormalInferenceResults
|
||||
econml.inference.EmpiricalInferenceResults
|
||||
econml.inference.PopulationSummaryResults
|
||||
|
||||
Inference Methods
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.inference.BootstrapInference
|
||||
econml.inference.GenericModelFinalInference
|
||||
econml.inference.GenericSingleTreatmentModelFinalInference
|
||||
econml.inference.LinearModelFinalInference
|
||||
econml.inference.StatsModelsInference
|
||||
econml.inference.GenericModelFinalInferenceDiscrete
|
||||
econml.inference.LinearModelFinalInferenceDiscrete
|
||||
econml.inference.StatsModelsInferenceDiscrete
|
||||
|
||||
|
||||
.. _utilities_api:
|
||||
|
||||
Utilities
|
||||
---------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.utilities
|
||||
|
||||
Private Module Reference
|
||||
|
@ -26,18 +237,5 @@ Private Module Reference
|
|||
|
||||
econml._ortho_learner
|
||||
econml._cate_estimator
|
||||
econml._causal_tree
|
||||
econml._shap
|
||||
econml.dml._rlearner
|
||||
econml.grf._base_grf
|
||||
econml.grf._base_grftree
|
||||
econml.grf._criterion
|
||||
|
||||
Scikit-Learn Extensions
|
||||
=======================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.sklearn_extensions.linear_model
|
||||
econml.sklearn_extensions.model_selection
|
||||
econml.inference._bootstrap
|
||||
|
|
|
@ -9,7 +9,7 @@ Detailed estimator comparison
|
|||
+=============================================+==============+==============+==================+=============+=================+============+==============+====================+
|
||||
| :class:`.NonparametricTwoStageLeastSquares` | Any | Yes | | Yes | Assumed | Yes | Yes | |
|
||||
+---------------------------------------------+--------------+--------------+------------------+-------------+-----------------+------------+--------------+--------------------+
|
||||
| :class:`.DeepIVEstimator` | Any | Yes | | | | Yes | Yes | |
|
||||
| :class:`.DeepIV` | Any | Yes | | | | Yes | Yes | |
|
||||
+---------------------------------------------+--------------+--------------+------------------+-------------+-----------------+------------+--------------+--------------------+
|
||||
| :class:`.SparseLinearDML` | Any | | Yes | Yes | Assumed | Yes | Yes | Yes |
|
||||
+---------------------------------------------+--------------+--------------+------------------+-------------+-----------------+------------+--------------+--------------------+
|
||||
|
@ -27,7 +27,7 @@ Detailed estimator comparison
|
|||
+---------------------------------------------+--------------+--------------+------------------+-------------+-----------------+------------+--------------+--------------------+
|
||||
| :class:`.DROrthoForest` | Categorical | | Yes | | | | Yes | Yes |
|
||||
+---------------------------------------------+--------------+--------------+------------------+-------------+-----------------+------------+--------------+--------------------+
|
||||
| :mod:`~econml.metalearners` | Categorical | | | | | Yes | Yes | Yes |
|
||||
| :ref:`metalearners <metalearners_api>` | Categorical | | | | | Yes | Yes | Yes |
|
||||
+---------------------------------------------+--------------+--------------+------------------+-------------+-----------------+------------+--------------+--------------------+
|
||||
| :class:`.DRLearner` | Categorical | | | | | | Yes | Yes |
|
||||
+---------------------------------------------+--------------+--------------+------------------+-------------+-----------------+------------+--------------+--------------------+
|
||||
|
|
|
@@ -445,7 +445,8 @@ Usage FAQs

Alternatively, you can pick the best first stage models outside of the EconML framework and pass in the selected models to EconML.
This can save on runtime and computational resources. Furthermore, it is statistically more stable since all data is being used for
- training rather than a fold. E.g.:
+ hyper-parameter tuning rather than a single fold inside of the DML algorithm (as long as the number of hyperparameter values
+ that you are selecting over is not exponential in the number of samples, this approach is statistically valid). E.g.:

.. testcode::
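The body of that testcode is not shown in this hunk. Purely as an illustration of the workflow described above (the model classes, hyperparameter grid, and variable names below are hypothetical placeholders, not taken from the EconML docs), first-stage model selection outside EconML could look like this:

```Python
# Illustrative sketch: tune nuisance models on the full sample with GridSearchCV,
# then hand the selected estimators to LinearDML.
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from econml.dml import LinearDML

n = 1000
X = np.random.normal(size=(n, 5))        # heterogeneity features
W = np.random.normal(size=(n, 3))        # controls
T = X[:, 0] + np.random.normal(size=n)   # continuous treatment
y = 2 * T + X[:, 0] + np.random.normal(size=n)
XW = np.hstack([X, W])

search = lambda: GridSearchCV(GradientBoostingRegressor(),
                              param_grid={'n_estimators': [50, 100], 'max_depth': [3, 5]},
                              cv=3)
model_y = search().fit(XW, y).best_estimator_   # selected outcome model
model_t = search().fit(XW, T).best_estimator_   # selected treatment model

# Pass the already-selected models to the DML estimator
est = LinearDML(model_y=model_y, model_t=model_t)
est.fit(y, T, X=X, W=W)
point = est.effect(X[:5])
```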
@ -723,7 +724,6 @@ lightning package implements such a class::
|
|||
from econml.dml import DML
|
||||
from sklearn.preprocessing import PolynomialFeatures
|
||||
from lightning.regression import FistaRegressor
|
||||
from econml.bootstrap import BootstrapEstimator
|
||||
from sklearn.linear_model import MultiTaskElasticNet
|
||||
|
||||
est = DML(model_y=MultiTaskElasticNet(alpha=0.1),
|
||||
|
|
|
@ -68,7 +68,7 @@ characteristics :math:`X` of the treated samples, then one can use this method.
|
|||
|
||||
.. testcode::
|
||||
|
||||
from econml.drlearner import LinearDRLearner
|
||||
from econml.dr import LinearDRLearner
|
||||
est = LinearDRLearner()
|
||||
est.fit(y, T, X=X, W=W)
|
||||
est.effect(X, T0=t0, T1=t1)
|
||||
|
@ -195,7 +195,7 @@ is chosen for the final stage. The user can choose any regression/classification
|
|||
in all these variants. The hierarchy
|
||||
structure of the implemented CATE estimators is as follows.
|
||||
|
||||
.. inheritance-diagram:: econml.drlearner.DRLearner econml.drlearner.LinearDRLearner econml.drlearner.SparseLinearDRLearner econml.drlearner.ForestDRLearner
|
||||
.. inheritance-diagram:: econml.dr.DRLearner econml.dr.LinearDRLearner econml.dr.SparseLinearDRLearner econml.dr.ForestDRLearner
|
||||
:parts: 1
|
||||
:private-bases:
|
||||
:top-classes: econml._ortho_learner._OrthoLearner, econml._cate_estimator.StatsModelsCateEstimatorDiscreteMixin, econml._cate_estimator.DebiasedLassoCateEstimatorDiscreteMixin
|
||||
|
@ -209,7 +209,7 @@ Below we give a brief description of each of these classes:
|
|||
|
||||
.. testcode::
|
||||
|
||||
from econml.drlearner import DRLearner
|
||||
from econml.dr import DRLearner
|
||||
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
|
||||
est = DRLearner(model_regression=GradientBoostingRegressor(),
|
||||
model_propensity=GradientBoostingClassifier(),
|
||||
|
@ -224,7 +224,7 @@ Below we give a brief description of each of these classes:
|
|||
|
||||
.. testcode::
|
||||
|
||||
from econml.drlearner import DRLearner
|
||||
from econml.dr import DRLearner
|
||||
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
model_reg = lambda: GridSearchCV(
|
||||
|
@ -260,7 +260,7 @@ Below we give a brief description of each of these classes:
|
|||
|
||||
.. testcode::
|
||||
|
||||
from econml.drlearner import LinearDRLearner
|
||||
from econml.dr import LinearDRLearner
|
||||
est = LinearDRLearner()
|
||||
est.fit(y, T, X=X, W=W)
|
||||
point = est.effect(X, T1=t1)
|
||||
|
@ -281,7 +281,7 @@ Below we give a brief description of each of these classes:
|
|||
|
||||
.. testcode::
|
||||
|
||||
from econml.drlearner import SparseLinearDRLearner
|
||||
from econml.dr import SparseLinearDRLearner
|
||||
est = SparseLinearDRLearner()
|
||||
est.fit(y, T, X=X, W=W)
|
||||
point = est.effect(X, T1=T1)
|
||||
|
@ -292,13 +292,13 @@ Below we give a brief description of each of these classes:
|
|||
|
||||
- **ForestDRLearner.** The child class :class:`.ForestDRLearner` uses a Subsampled Honest Forest regressor
|
||||
as a final model (see [Wager2018]_ and [Athey2019]_). The subsampled honest forest is implemented in our library as a scikit-learn extension
|
||||
of the :class:`~sklearn.ensemble.RandomForestRegressor`, in the class :class:`.SubsampledHonestForest`. This estimator
|
||||
of the :class:`~sklearn.ensemble.RandomForestRegressor`, in the class :class:`~econml.grf.RegressionForest`. This estimator
|
||||
offers confidence intervals via the Bootstrap-of-Little-Bags as described in [Athey2019]_.
|
||||
Using this functionality we can also construct confidence intervals for the CATE:
|
||||
|
||||
.. testcode::
|
||||
|
||||
from econml.drlearner import ForestDRLearner
|
||||
from econml.dr import ForestDRLearner
|
||||
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
|
||||
est = ForestDRLearner(model_regression=GradientBoostingRegressor(),
|
||||
model_propensity=GradientBoostingClassifier())
|
||||
|
@ -323,7 +323,7 @@ Usage FAQs
|
|||
|
||||
.. testcode::
|
||||
|
||||
from econml.drlearner import LinearDRLearner
|
||||
from econml.dr import LinearDRLearner
|
||||
est = LinearDRLearner()
|
||||
est.fit(y, T, X=X, W=W)
|
||||
lb, ub = est.const_marginal_effect_interval(X, alpha=.05)
|
||||
|
@ -341,7 +341,7 @@ Usage FAQs
|
|||
|
||||
.. testcode::
|
||||
|
||||
from econml.drlearner import SparseLinearDRLearner
|
||||
from econml.dr import SparseLinearDRLearner
|
||||
from sklearn.preprocessing import PolynomialFeatures
|
||||
est = SparseLinearDRLearner(featurizer=PolynomialFeatures(degree=3, include_bias=False))
|
||||
est.fit(y, T, X=X, W=W)
|
||||
|
@ -355,7 +355,7 @@ Usage FAQs
|
|||
|
||||
.. testcode::
|
||||
|
||||
from econml.drlearner import ForestDRLearner
|
||||
from econml.dr import ForestDRLearner
|
||||
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
|
||||
est = ForestDRLearner(model_regression=GradientBoostingRegressor(),
|
||||
model_propensity=GradientBoostingClassifier())
|
||||
|
@ -383,7 +383,7 @@ Usage FAQs
|
|||
|
||||
.. testcode::
|
||||
|
||||
from econml.drlearner import SparseLinearDRLearner
|
||||
from econml.dr import SparseLinearDRLearner
|
||||
from sklearn.linear_model import LassoCV, LogisticRegressionCV, ElasticNetCV
|
||||
from sklearn.ensemble import GradientBoostingRegressor
|
||||
est = SparseLinearDRLearner(model_regression=LassoCV(),
|
||||
|
@ -409,7 +409,7 @@ Usage FAQs
|
|||
|
||||
.. testcode::
|
||||
|
||||
from econml.drlearner import DRLearner
|
||||
from econml.dr import DRLearner
|
||||
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
model_reg = lambda: GridSearchCV(
|
||||
|
@@ -433,7 +433,8 @@ Usage FAQs

Alternatively, you can pick the best first stage models outside of the EconML framework and pass in the selected models to EconML.
This can save on runtime and computational resources. Furthermore, it is statistically more stable since all data is being used for
- training rather than a fold. E.g.:
+ hyper-parameter tuning rather than a single fold inside of the DML algorithm (as long as the number of hyperparameter values
+ that you are selecting over is not exponential in the number of samples, this approach is statistically valid). E.g.:

.. testcode::
|
||||
|
||||
|
@ -475,7 +476,7 @@ Usage FAQs
|
|||
|
||||
.. testcode::
|
||||
|
||||
from econml.drlearner import DRLearner
|
||||
from econml.dr import DRLearner
|
||||
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
|
||||
est = DRLearner(model_regression=RandomForestRegressor(oob_score=True),
|
||||
model_propensity=RandomForestClassifier(min_samples_leaf=10, oob_score=True),
|
||||
|
|
|
@ -165,7 +165,7 @@ some extensions to the scikit-learn library that enable sample weights, such as
|
|||
.. testcode:: intro
|
||||
:hide:
|
||||
|
||||
from econml.ortho_forest import DMLOrthoForest
|
||||
from econml.orf import DMLOrthoForest
|
||||
from econml.sklearn_extensions.linear_model import WeightedLasso
|
||||
|
||||
.. doctest:: intro
|
||||
|
@ -303,10 +303,10 @@ sample :math:`X_i`. This is implemented in the RegressionForest (see :class:`.Re
|
|||
Class Hierarchy Structure
|
||||
=========================
|
||||
|
||||
.. inheritance-diagram:: econml.ortho_forest.DMLOrthoForest econml.ortho_forest.DROrthoForest econml.drlearner.ForestDRLearner econml.dml.CausalForestDML
|
||||
.. inheritance-diagram:: econml.orf.DMLOrthoForest econml.orf.DROrthoForest econml.dr.ForestDRLearner econml.dml.CausalForestDML
|
||||
:parts: 1
|
||||
:private-bases:
|
||||
:top-classes: econml._ortho_learner._OrthoLearner, econml.ortho_forest.BaseOrthoForest, econml._cate_estimator.LinearCateEstimator
|
||||
:top-classes: econml._ortho_learner._OrthoLearner, econml.orf.BaseOrthoForest, econml._cate_estimator.LinearCateEstimator
|
||||
|
||||
|
||||
Usage Examples
|
||||
|
@ -323,7 +323,7 @@ and the `ForestLearners Jupyter notebook <https://github.com/microsoft/EconML/bl
|
|||
|
||||
import numpy as np
|
||||
import sklearn
|
||||
from econml.ortho_forest import DMLOrthoForest, DROrthoForest
|
||||
from econml.orf import DMLOrthoForest, DROrthoForest
|
||||
np.random.seed(123)
|
||||
|
||||
>>> T = np.array([0, 1]*60)
|
||||
|
@ -333,7 +333,7 @@ and the `ForestLearners Jupyter notebook <https://github.com/microsoft/EconML/bl
|
|||
... model_T=sklearn.linear_model.LinearRegression(),
|
||||
... model_Y=sklearn.linear_model.LinearRegression())
|
||||
>>> est.fit(Y, T, X=W, W=W)
|
||||
<econml.ortho_forest.DMLOrthoForest object at 0x...>
|
||||
<econml.orf._ortho_forest.DMLOrthoForest object at 0x...>
|
||||
>>> print(est.effect(W[:2]))
|
||||
[1.00... 1.19...]
|
||||
|
||||
|
@ -346,7 +346,7 @@ Similarly, we can call :class:`.DROrthoForest`:
|
|||
... propensity_model=sklearn.linear_model.LogisticRegression(),
|
||||
... model_Y=sklearn.linear_model.LinearRegression())
|
||||
>>> est.fit(Y, T, X=W, W=W)
|
||||
<econml.ortho_forest.DROrthoForest object at 0x...>
|
||||
<econml.orf._ortho_forest.DROrthoForest object at 0x...>
|
||||
>>> print(est.effect(W[:2]))
|
||||
[0.99... 1.35...]
|
||||
|
||||
|
@ -355,8 +355,8 @@ and with more realistic noisy data. In this case we can just use the default par
|
|||
of the class, which specify the use of the :class:`~sklearn.linear_model.LassoCV` for
|
||||
both the treatment and the outcome regressions, in the case of continuous treatments.
|
||||
|
||||
>>> from econml.ortho_forest import DMLOrthoForest
|
||||
>>> from econml.ortho_forest import DMLOrthoForest
|
||||
>>> from econml.orf import DMLOrthoForest
|
||||
>>> from econml.orf import DMLOrthoForest
|
||||
>>> from econml.sklearn_extensions.linear_model import WeightedLasso
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> np.random.seed(123)
|
||||
|
@ -370,7 +370,7 @@ both the treatment and the outcome regressions, in the case of continuous treatm
|
|||
... model_Y=WeightedLasso(alpha=0.01),
|
||||
... model_T=WeightedLasso(alpha=0.01))
|
||||
>>> est.fit(Y, T, X=X, W=W)
|
||||
<econml.ortho_forest.DMLOrthoForest object at 0x...>
|
||||
<econml.orf._ortho_forest.DMLOrthoForest object at 0x...>
|
||||
>>> X_test = np.linspace(-1, 1, 30).reshape(-1, 1)
|
||||
>>> treatment_effects = est.effect(X_test)
|
||||
>>> plt.plot(X_test[:, 0], treatment_effects, label='ORF estimate')
|
||||
|
|
|
@@ -135,7 +135,7 @@ See :ref:`Double Machine Learning User Guide <dmluserguide>`.
Class Hierarchy Structure
==================================

- .. inheritance-diagram:: econml.metalearners.SLearner econml.metalearners.TLearner econml.metalearners.XLearner econml.metalearners.DomainAdaptationLearner econml.drlearner.DRLearner econml.dml.DML
+ .. inheritance-diagram:: econml.metalearners.SLearner econml.metalearners.TLearner econml.metalearners.XLearner econml.metalearners.DomainAdaptationLearner econml.dr.DRLearner econml.dml.DML
    :parts: 1
    :private-bases:
    :top-classes: econml._ortho_learner._OrthoLearner, econml._cate_estimator.LinearCateEstimator, econml._cate_estimator.TreatmentExpansionMixin
|
||||
|
|
|
@@ -2,7 +2,7 @@
Sieve 2SLS Instrumental Variable Estimation
===========================================

- The sieve based instrumental variable module is based on a two-stage least squares estimation procedure.
+ The sieve based instrumental variable estimator :class:`.SieveTSLS` is based on a two-stage least squares estimation procedure.
The user must specify the sieve basis for :math:`T`, :math:`X` and :math:`Y` (Hermite polynomial or a set of indicator
functions), and the number of elements of the basis expansion to include. Formally, we now assume that we can write:
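The formal model referenced at the end of this hunk is elided by the diff. As a usage-level sketch only (the keyword names `t_featurizer`, `x_featurizer`, `z_featurizer` and `dt_featurizer`, and the `shift` argument of `HermiteFeatures`, are assumptions about the API rather than something shown in this commit), fitting the renamed estimator could look roughly like this:

```Python
# Hedged sketch: sieve-based 2SLS with Hermite polynomial bases.
# Keyword names and HermiteFeatures arguments are assumed, not confirmed by this diff.
from econml.iv.sieve import SieveTSLS, HermiteFeatures

est = SieveTSLS(t_featurizer=HermiteFeatures(degree=2),
                x_featurizer=HermiteFeatures(degree=2),
                z_featurizer=HermiteFeatures(degree=2),
                dt_featurizer=HermiteFeatures(degree=2, shift=1))  # derivative of the T basis
est.fit(Y, T, X=X, W=W, Z=Z)          # Y, T, X, W, Z assumed to be defined numpy arrays
effects = est.effect(X_test, T0=T0, T1=T1)
```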
|
||||
|
||||
|
|
|
@ -60,7 +60,7 @@ This for instance holds for the :class:`.LinearDML` and the :class:`.LinearDRLea
|
|||
|
||||
.. testcode::
|
||||
|
||||
from econml.drlearner import LinearDRLearner
|
||||
from econml.dr import LinearDRLearner
|
||||
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
|
||||
est = LinearDRLearner(model_regression=RandomForestRegressor(n_estimators=10, min_samples_leaf=10),
|
||||
model_propensity=RandomForestClassifier(n_estimators=10, min_samples_leaf=10))
|
||||
|
@ -92,7 +92,7 @@ explicitly setting ``inference='debiasedlasso'``, e.g.:
|
|||
|
||||
.. testcode::
|
||||
|
||||
from econml.drlearner import SparseLinearDRLearner
|
||||
from econml.dr import SparseLinearDRLearner
|
||||
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
|
||||
est = SparseLinearDRLearner(model_regression=RandomForestRegressor(n_estimators=10, min_samples_leaf=10),
|
||||
model_propensity=RandomForestClassifier(n_estimators=10, min_samples_leaf=10))
|
||||
|
@ -126,7 +126,7 @@ or by explicitly setting ``inference='blb'``, e.g.:
|
|||
|
||||
.. testcode::
|
||||
|
||||
from econml.drlearner import ForestDRLearner
|
||||
from econml.dr import ForestDRLearner
|
||||
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
|
||||
est = ForestDRLearner(model_regression=RandomForestRegressor(n_estimators=10, min_samples_leaf=10),
|
||||
model_propensity=RandomForestClassifier(n_estimators=10, min_samples_leaf=10))
|
||||
|
@ -134,7 +134,7 @@ or by explicitly setting ``inference='blb'``, e.g.:
|
|||
point = est.effect(X)
|
||||
lb, ub = est.effect_interval(X, alpha=0.05)
|
||||
|
||||
This inference is enabled by our implementation of the :class:`.SubsampledHonestForest` extension to the scikit-learn
|
||||
This inference is enabled by our implementation of the :class:`~econml.grf.RegressionForest` extension to the scikit-learn
|
||||
:class:`~sklearn.ensemble.RandomForestRegressor`.
|
||||
|
||||
|
||||
|
@ -148,7 +148,7 @@ inference at its default setting of ``'auto'`` or by explicitly setting ``infere
|
|||
|
||||
.. testcode::
|
||||
|
||||
from econml.ortho_forest import DMLOrthoForest
|
||||
from econml.orf import DMLOrthoForest
|
||||
from econml.sklearn_extensions.linear_model import WeightedLasso
|
||||
est = DMLOrthoForest(n_trees=10,
|
||||
min_leaf_size=3,
|
||||
|
|
|
@@ -1,7 +1,10 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

__all__ = ['automated_ml', 'bootstrap',
           'cate_interpreter', 'causal_forest',
-          'data', 'deepiv', 'dml', 'drlearner', 'inference',
-          'metalearners', 'ortho_forest', 'ortho_iv',
+          'data', 'deepiv', 'dml', 'dr', 'drlearner',
+          'inference', 'iv',
+          'metalearners', 'ortho_forest', 'orf', 'ortho_iv',
           'score', 'sklearn_extensions', 'tree',
           'two_stage_least_squares', 'utilities']
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from ._automated_ml import (setAutomatedMLWorkspace, addAutomatedML,
|
||||
AutomatedMLModel, AutomatedMLMixin, EconAutoMLConfig)
|
||||
|
||||
__all__ = ["setAutomatedMLWorkspace",
|
||||
"addAutomatedML",
|
||||
"AutomatedMLModel",
|
||||
"AutomatedMLMixin",
|
||||
"EconAutoMLConfig"]
|
|
@ -1,3 +1,6 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
# AzureML
|
||||
from azureml.core.experiment import Experiment
|
||||
from azureml.core import Workspace
|
|
@ -1,281 +1,17 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""Bootstrap sampling."""
|
||||
import numpy as np
|
||||
from joblib import Parallel, delayed
|
||||
from sklearn.base import clone
|
||||
from scipy.stats import norm
|
||||
from collections import OrderedDict
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class BootstrapEstimator:
|
||||
"""Estimator that uses bootstrap sampling to wrap an existing estimator.
|
||||
|
||||
This estimator provides a `fit` method with the same signature as the wrapped estimator.
|
||||
|
||||
The bootstrap estimator will also wrap all other methods and attributes of the wrapped estimator,
|
||||
but return the average of the sampled calculations (this will fail for non-numeric outputs).
|
||||
|
||||
It will also provide a wrapper method suffixed with `_interval` for each method or attribute of
|
||||
the wrapped estimator that takes two additional optional keyword arguments `lower` and `upper` specifying
|
||||
the percentiles of the interval, and which uses `np.percentile` to return the corresponding lower
|
||||
and upper bounds based on the sampled calculations. For example, if the underlying estimator supports
|
||||
an `effect` method with signature `(X,T) -> Y`, this class will provide a method `effect_interval`
|
||||
with pseudo-signature `(lower=5, upper=95, X, T) -> (Y, Y)` (where `lower` and `upper` cannot be
|
||||
supplied as positional arguments).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
wrapped : object
|
||||
The basis for the clones used for estimation.
|
||||
This object must support a `fit` method which takes numpy arrays with consistent first dimensions
|
||||
as arguments.
|
||||
|
||||
n_bootstrap_samples : int
|
||||
How many draws to perform.
|
||||
|
||||
n_jobs: int, default: None
|
||||
The maximum number of concurrently running jobs, as in joblib.Parallel.
|
||||
|
||||
compute_means : bool, default: True
|
||||
Whether to pass calls through to the underlying collection and return the mean. Setting this
|
||||
to ``False`` can avoid ambiguities if the wrapped object itself has method names with an `_interval` suffix.
|
||||
|
||||
bootstrap_type: 'percentile', 'pivot', or 'normal', default 'pivot'
|
||||
Bootstrap method used to compute results. 'percentile' will result in using the empirical CDF of
|
||||
the replicated computations of the statistics. 'pivot' will also use the replicates but create a pivot
|
||||
interval that also relies on the estimate over the entire dataset. 'normal' will instead compute an interval
|
||||
assuming the replicates are normally distributed.
|
||||
"""
|
||||
|
||||
def __init__(self, wrapped, n_bootstrap_samples=1000, n_jobs=None, compute_means=True, bootstrap_type='pivot'):
|
||||
self._instances = [clone(wrapped, safe=False) for _ in range(n_bootstrap_samples)]
|
||||
self._n_bootstrap_samples = n_bootstrap_samples
|
||||
self._n_jobs = n_jobs
|
||||
self._compute_means = compute_means
|
||||
self._bootstrap_type = bootstrap_type
|
||||
self._wrapped = wrapped
|
||||
|
||||
# TODO: Add a __dir__ implementation?
|
||||
|
||||
@staticmethod
|
||||
def __stratified_indices(arr):
|
||||
assert 1 <= np.ndim(arr) <= 2
|
||||
unique = np.unique(arr, axis=0)
|
||||
indices = []
|
||||
for el in unique:
|
||||
ind, = np.where(np.all(arr == el, axis=1) if np.ndim(arr) == 2 else arr == el)
|
||||
indices.append(ind)
|
||||
return indices
|
||||
|
||||
def fit(self, *args, **named_args):
|
||||
"""
|
||||
Fit the model.
|
||||
|
||||
The full signature of this method is the same as that of the wrapped object's `fit` method.
|
||||
"""
|
||||
from ._cate_estimator import BaseCateEstimator # need to nest this here to avoid circular import
|
||||
|
||||
index_chunks = None
|
||||
if isinstance(self._instances[0], BaseCateEstimator):
|
||||
index_chunks = self._instances[0]._strata(*args, **named_args)
|
||||
if index_chunks is not None:
|
||||
index_chunks = self.__stratified_indices(index_chunks)
|
||||
if index_chunks is None:
|
||||
n_samples = np.shape(args[0] if args else named_args[(*named_args,)[0]])[0]
|
||||
index_chunks = [np.arange(n_samples)] # one chunk with all indices
|
||||
|
||||
indices = []
|
||||
for chunk in index_chunks:
|
||||
n_samples = len(chunk)
|
||||
indices.append(chunk[np.random.choice(n_samples,
|
||||
size=(self._n_bootstrap_samples, n_samples),
|
||||
replace=True)])
|
||||
|
||||
indices = np.hstack(indices)
|
||||
|
||||
def fit(x, *args, **kwargs):
|
||||
x.fit(*args, **kwargs)
|
||||
return x # Explicitly return x in case fit fails to return its target
|
||||
|
||||
def convertArg(arg, inds):
|
||||
if arg is None:
|
||||
return None
|
||||
arr = np.asarray(arg)
|
||||
if arr.ndim > 0:
|
||||
return arr[inds]
|
||||
else: # arg was a scalar, so we shouldn't have converted it
|
||||
return arg
|
||||
|
||||
self._instances = Parallel(n_jobs=self._n_jobs, prefer='threads', verbose=3)(
|
||||
delayed(fit)(obj,
|
||||
*[convertArg(arg, inds) for arg in args],
|
||||
**{arg: convertArg(named_args[arg], inds) for arg in named_args})
|
||||
for obj, inds in zip(self._instances, indices)
|
||||
)
|
||||
return self
|
||||
|
||||
def __getattr__(self, name):
|
||||
"""
|
||||
Get proxy attribute that wraps the corresponding attribute with the same name from the wrapped object.
|
||||
|
||||
Additionally, the suffix "_interval" is supported for getting an interval instead of a point estimate.
|
||||
"""
|
||||
|
||||
# don't proxy special methods
|
||||
if name.startswith('__'):
|
||||
raise AttributeError(name)
|
||||
|
||||
def proxy(make_call, name, summary):
|
||||
def summarize_with(f):
|
||||
results = np.array(Parallel(n_jobs=self._n_jobs, prefer='threads', verbose=3)(
|
||||
(f, (obj, name), {}) for obj in self._instances)), f(self._wrapped, name)
|
||||
return summary(*results)
|
||||
if make_call:
|
||||
def call(*args, **kwargs):
|
||||
return summarize_with(lambda obj, name: getattr(obj, name)(*args, **kwargs))
|
||||
return call
|
||||
else:
|
||||
return summarize_with(lambda obj, name: getattr(obj, name))
|
||||
|
||||
def get_mean():
|
||||
# for attributes that exist on the wrapped object, just compute the mean of the wrapped calls
|
||||
return proxy(callable(getattr(self._instances[0], name)), name, lambda arr, _: np.mean(arr, axis=0))
|
||||
|
||||
def get_std():
|
||||
prefix = name[: - len('_std')]
|
||||
return proxy(callable(getattr(self._instances[0], prefix)), prefix,
|
||||
lambda arr, _: np.std(arr, axis=0))
|
||||
|
||||
def get_interval():
|
||||
# if the attribute exists on the wrapped object once we remove the suffix,
|
||||
# then we should be computing a confidence interval for the wrapped calls
|
||||
prefix = name[: - len("_interval")]
|
||||
|
||||
def call_with_bounds(can_call, lower, upper):
|
||||
def percentile_bootstrap(arr, _):
|
||||
return np.percentile(arr, lower, axis=0), np.percentile(arr, upper, axis=0)
|
||||
|
||||
def pivot_bootstrap(arr, est):
|
||||
return 2 * est - np.percentile(arr, upper, axis=0), 2 * est - np.percentile(arr, lower, axis=0)
|
||||
|
||||
def normal_bootstrap(arr, est):
|
||||
std = np.std(arr, axis=0)
|
||||
return est - norm.ppf(upper / 100) * std, est - norm.ppf(lower / 100) * std
|
||||
|
||||
# TODO: studentized bootstrap? this would be more accurate in most cases but can we avoid
|
||||
# second level bootstrap which would be prohibitive computationally?
|
||||
|
||||
fn = {'percentile': percentile_bootstrap,
|
||||
'normal': normal_bootstrap,
|
||||
'pivot': pivot_bootstrap}[self._bootstrap_type]
|
||||
return proxy(can_call, prefix, fn)
|
||||
|
||||
can_call = callable(getattr(self._instances[0], prefix))
|
||||
if can_call:
|
||||
# collect extra arguments and pass them through, if the wrapped attribute was callable
|
||||
def call(*args, lower=5, upper=95, **kwargs):
|
||||
return call_with_bounds(can_call, lower, upper)(*args, **kwargs)
|
||||
return call
|
||||
else:
|
||||
# don't pass extra arguments if the wrapped attribute wasn't callable to begin with
|
||||
def call(lower=5, upper=95):
|
||||
return call_with_bounds(can_call, lower, upper)
|
||||
return call
|
||||
|
||||
def get_inference():
|
||||
# can't import from econml.inference at top level without creating cyclical dependencies
|
||||
from .inference import EmpiricalInferenceResults, NormalInferenceResults
|
||||
from ._cate_estimator import LinearModelFinalCateEstimatorDiscreteMixin
|
||||
|
||||
prefix = name[: - len("_inference")]
|
||||
|
||||
def fname_transformer(x):
|
||||
return x
|
||||
|
||||
if prefix in ['const_marginal_effect', 'marginal_effect', 'effect']:
|
||||
inf_type = 'effect'
|
||||
elif prefix == 'coef_':
|
||||
inf_type = 'coefficient'
|
||||
if (hasattr(self._instances[0], 'cate_feature_names') and
|
||||
callable(self._instances[0].cate_feature_names)):
|
||||
def fname_transformer(x):
|
||||
return self._instances[0].cate_feature_names(x)
|
||||
elif prefix == 'intercept_':
|
||||
inf_type = 'intercept'
|
||||
else:
|
||||
raise AttributeError("Unsupported inference: " + name)
|
||||
|
||||
d_t = self._wrapped._d_t[0] if self._wrapped._d_t else 1
|
||||
if prefix == 'effect' or (isinstance(self._wrapped, LinearModelFinalCateEstimatorDiscreteMixin) and
|
||||
(inf_type == 'coefficient' or inf_type == 'intercept')):
|
||||
d_t = 1
|
||||
d_y = self._wrapped._d_y[0] if self._wrapped._d_y else 1
|
||||
|
||||
can_call = callable(getattr(self._instances[0], prefix))
|
||||
|
||||
kind = self._bootstrap_type
|
||||
if kind == 'percentile' or kind == 'pivot':
|
||||
def get_dist(est, arr):
|
||||
if kind == 'percentile':
|
||||
return arr
|
||||
elif kind == 'pivot':
|
||||
return 2 * est - arr
|
||||
else:
|
||||
raise ValueError("Invalid kind, must be either 'percentile' or 'pivot'")
|
||||
|
||||
def get_result():
|
||||
return proxy(can_call, prefix,
|
||||
lambda arr, est: EmpiricalInferenceResults(
|
||||
d_t=d_t, d_y=d_y,
|
||||
pred=est, pred_dist=get_dist(est, arr),
|
||||
inf_type=inf_type,
|
||||
fname_transformer=fname_transformer,
|
||||
**self._wrapped._input_names if hasattr(self._wrapped, "_input_names") else None))
|
||||
|
||||
# Note that inference results are always methods even if the inference is for a property
|
||||
# (e.g. coef__inference() is a method but coef_ is a property)
|
||||
# Therefore we must insert a lambda if getting inference for a non-callable
|
||||
return get_result() if can_call else get_result
|
||||
|
||||
else:
|
||||
assert kind == 'normal'
|
||||
|
||||
def normal_inference(*args, **kwargs):
|
||||
pred = getattr(self._wrapped, prefix)
|
||||
if can_call:
|
||||
pred = pred(*args, **kwargs)
|
||||
stderr = getattr(self, prefix + '_std')
|
||||
if can_call:
|
||||
stderr = stderr(*args, **kwargs)
|
||||
return NormalInferenceResults(
|
||||
d_t=d_t, d_y=d_y, pred=pred,
|
||||
pred_stderr=stderr, inf_type=inf_type,
|
||||
fname_transformer=fname_transformer,
|
||||
**self._wrapped._input_names if hasattr(self._wrapped, "_input_names") else None)
|
||||
|
||||
# If inference is for a property, create a fresh lambda to avoid passing args through
|
||||
return normal_inference if can_call else lambda: normal_inference()
|
||||
|
||||
caught = None
|
||||
m = None
|
||||
if name.endswith("_interval"):
|
||||
m = get_interval
|
||||
elif name.endswith("_std"):
|
||||
m = get_std
|
||||
elif name.endswith("_inference"):
|
||||
m = get_inference
|
||||
|
||||
# try to get interval/std first if appropriate,
|
||||
# since we don't prefer a wrapped method with this name
|
||||
if m is not None:
|
||||
try:
|
||||
return m()
|
||||
except AttributeError as err:
|
||||
caught = err
|
||||
if self._compute_means:
|
||||
return get_mean()
|
||||
|
||||
raise (caught if caught else AttributeError(name))
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import econml.inference._bootstrap as bootstrap
|
||||
from .utilities import deprecated
|
||||
|
||||
|
||||
@deprecated("The econml.bootstrap.BootstrapEstimator class has been moved "
|
||||
"to econml.inference._bootstrap.BootstrapEstimator and is no longer part of the public API; "
|
||||
"an upcoming release will remove support for the old name and will consider `BootstrapEstimator` "
|
||||
"as part of the private API with no guarantee of API consistency across releases. "
|
||||
"Instead of wrapping CATE esitmators with the BootstrapEstimator to preduce bootstrap confidence "
|
||||
"intervals, consider passing `inference='bootstrap'` or "
|
||||
"`inference=econml.inference.BootstrapInference(n_bootstrap_samples=..., bootstrap_type=...)`, "
|
||||
"as a keyword argument at the `fit` method of the CATE estimator.")
|
||||
class BootstrapEstimator(bootstrap.BootstrapEstimator):
|
||||
pass
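The deprecation message above already names the supported replacement; a minimal sketch of that pattern (the estimator and data below are illustrative placeholders) is:

```Python
# Sketch of the recommended pattern: request bootstrap confidence intervals at fit time
# instead of wrapping the estimator in BootstrapEstimator.
from econml.dr import LinearDRLearner
from econml.inference import BootstrapInference

est = LinearDRLearner()
est.fit(y, T, X=X, W=W,
        inference=BootstrapInference(n_bootstrap_samples=100, bootstrap_type='pivot'))
lb, ub = est.effect_interval(X, alpha=0.05)   # y, T, X, W assumed to be defined
```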
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
from .ortho_forest import DMLOrthoForest
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from .utilities import LassoCVWrapper, deprecated
|
||||
from sklearn.linear_model import LogisticRegressionCV
|
||||
from .dml import CausalForestDML
|
||||
|
||||
|
||||
@deprecated("The CausalForest class has been deprecated by the CausalForestDML; "
|
||||
@deprecated("The CausalForest class has been deprecated by the econml.dml.CausalForestDML; "
|
||||
"an upcoming release will remove support for the old class")
|
||||
def CausalForest(n_trees=500,
|
||||
min_leaf_size=10,
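Since the hunk above is truncated mid-signature, a short sketch of the replacement estimator named in the deprecation message may be useful (the nuisance models and data here are illustrative placeholders, not part of this commit):

```Python
# Hedged sketch: using econml.dml.CausalForestDML, which supersedes the CausalForest wrapper.
from econml.dml import CausalForestDML
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

est = CausalForestDML(model_y=GradientBoostingRegressor(),
                      model_t=GradientBoostingClassifier(),
                      discrete_treatment=True,
                      n_estimators=500,
                      min_samples_leaf=10)
est.fit(y, T, X=X, W=W)                      # y, T, X, W assumed to be defined
point = est.effect(X)
lb, ub = est.effect_interval(X, alpha=0.05)  # Bootstrap-of-Little-Bags intervals
```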
|
||||
|
|
478 econml/deepiv.py
|
@ -1,461 +1,17 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""Deep IV estimator and related components."""
|
||||
|
||||
import numpy as np
|
||||
import keras
|
||||
from ._cate_estimator import BaseCateEstimator
|
||||
from .utilities import deprecated
|
||||
from keras import backend as K
|
||||
import keras.layers as L
|
||||
from keras.models import Model
|
||||
from econml.utilities import check_input_arrays, _deprecate_positional
|
||||
|
||||
# TODO: make sure to use random seeds wherever necessary
|
||||
# TODO: make sure that the public API consistently uses "T" instead of "P" for the treatment
|
||||
|
||||
# unfortunately with the Theano and Tensorflow backends,
|
||||
# the straightforward use of K.stop_gradient can cause an error
|
||||
# because the parameters of the intermediate layers are now disconnected from the loss;
|
||||
# therefore we add a pointless multiplication by 0 to the values in each of the variables in vs
|
||||
# so that those layers remain connected but with 0 gradient
|
||||
|
||||
|
||||
def _zero_grad(e, vs):
|
||||
if K.backend() == 'cntk':
|
||||
return K.stop_gradient(e)
|
||||
else:
|
||||
z = 0 * K.sum(K.concatenate([K.batch_flatten(v) for v in vs]))
|
||||
return K.stop_gradient(e) + z
|
||||
|
||||
|
||||
def mog_model(n_components, d_x, d_t):
|
||||
"""
|
||||
Create a mixture of Gaussians model with the specified number of components.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_components : int
|
||||
The number of components in the mixture model
|
||||
|
||||
d_x : int
|
||||
The number of dimensions in the layer used as input
|
||||
|
||||
d_t : int
|
||||
The number of dimensions in the output
|
||||
|
||||
Returns
|
||||
-------
|
||||
A Keras model that takes an input of dimension `d_x` and generates three outputs: pi, mu, and sigma
|
||||
|
||||
"""
|
||||
x = L.Input((d_x,))
|
||||
pi = L.Dense(n_components, activation='softmax')(x)
|
||||
mu = L.Reshape((n_components, d_t))(L.Dense(n_components * d_t)(x))
|
||||
log_sig = L.Dense(n_components)(x)
|
||||
sig = L.Lambda(K.exp)(log_sig)
|
||||
return Model([x], [pi, mu, sig])
|
||||
|
||||
|
||||
def mog_loss_model(n_components, d_t):
|
||||
"""
|
||||
Create a Keras model that computes the loss of a mixture of Gaussians model on data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_components : int
|
||||
The number of components in the mixture model
|
||||
|
||||
d_t : int
|
||||
The number of dimensions in the output
|
||||
|
||||
Returns
|
||||
-------
|
||||
A Keras model that takes as inputs pi, mu, sigma, and t and generates a single output containing the loss.
|
||||
|
||||
"""
|
||||
pi = L.Input((n_components,))
|
||||
mu = L.Input((n_components, d_t))
|
||||
sig = L.Input((n_components,))
|
||||
t = L.Input((d_t,))
|
||||
|
||||
# || t - mu_i || ^2
|
||||
d2 = L.Lambda(lambda d: K.sum(K.square(d), axis=-1),
|
||||
output_shape=(n_components,))(
|
||||
L.Subtract()([L.RepeatVector(n_components)(t), mu])
|
||||
)
|
||||
|
||||
# LL = C - log(sum(pi_i/sig^d * exp(-d2/(2*sig^2))))
|
||||
# Use logsumexp for numeric stability:
|
||||
# LL = C - log(sum(exp(-d2/(2*sig^2) + log(pi_i/sig^d))))
|
||||
# TODO: does the numeric stability actually make any difference?
|
||||
def make_logloss(d2, sig, pi):
|
||||
return -K.logsumexp(-d2 / (2 * K.square(sig)) + K.log(pi / K.pow(sig, d_t)), axis=-1)
|
||||
|
||||
ll = L.Lambda(lambda dsp: make_logloss(*dsp), output_shape=(1,))([d2, sig, pi])
|
||||
|
||||
m = Model([pi, mu, sig, t], [ll])
|
||||
return m
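As a side note on the numerically stable form used in `make_logloss` above, here is a small self-contained NumPy check (toy numbers, not part of the library) that the logsumexp expression matches the direct mixture negative log-likelihood:

```Python
import numpy as np
from scipy.special import logsumexp

# Toy check that  -logsumexp(-d2/(2*sig^2) + log(pi/sig^d))
# equals  -log(sum_i pi_i / sig_i**d * exp(-d2_i / (2 * sig_i**2)))
d_t = 2
pi = np.array([0.2, 0.5, 0.3])
sig = np.array([0.5, 1.0, 2.0])
d2 = np.array([1.3, 0.4, 2.7])   # squared distances ||t - mu_i||^2

direct = -np.log(np.sum(pi / sig**d_t * np.exp(-d2 / (2 * sig**2))))
stable = -logsumexp(-d2 / (2 * sig**2) + np.log(pi / sig**d_t))
print(np.isclose(direct, stable))  # True
```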
|
||||
|
||||
|
||||
def mog_sample_model(n_components, d_t):
|
||||
"""
|
||||
Create a model that generates samples from a mixture of Gaussians.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_components : int
|
||||
The number of components in the mixture model
|
||||
|
||||
d_t : int
|
||||
The number of dimensions in the output
|
||||
|
||||
Returns
|
||||
-------
|
||||
A Keras model that takes as inputs pi, mu, and sigma, and generates a single output containing a sample.
|
||||
|
||||
"""
|
||||
pi = L.Input((n_components,))
|
||||
mu = L.Input((n_components, d_t))
|
||||
sig = L.Input((n_components,))
|
||||
|
||||
# CNTK backend can't randomize across batches and doesn't implement cumsum (at least as of June 2018,
|
||||
# see Known Issues on https://docs.microsoft.com/en-us/cognitive-toolkit/Using-CNTK-with-Keras)
|
||||
def sample(pi, mu, sig):
|
||||
batch_size = K.shape(pi)[0]
|
||||
if K.backend() == 'cntk':
|
||||
# generate cumulative sum via matrix multiplication
|
||||
cumsum = K.dot(pi, K.constant(np.triu(np.ones((n_components, n_components)))))
|
||||
else:
|
||||
cumsum = K.cumsum(pi, 1)
|
||||
cumsum_shift = K.concatenate([K.zeros_like(cumsum[:, 0:1]), cumsum])[:, :-1]
|
||||
if K.backend() == 'cntk':
|
||||
import cntk as C
|
||||
# Generate standard uniform values in shape (batch_size,1)
|
||||
# (since we can't use the dynamic batch_size with random.uniform in CNTK,
|
||||
# we use uniform_like instead with an input of an appropriate shape)
|
||||
rndSmp = C.random.uniform_like(pi[:, 0:1])
|
||||
else:
|
||||
rndSmp = K.random_uniform((batch_size, 1))
|
||||
cmp1 = K.less_equal(cumsum_shift, rndSmp)
|
||||
cmp2 = K.less(rndSmp, cumsum)
|
||||
|
||||
# convert to floats and multiply to perform equivalent of logical AND
|
||||
rndIndex = K.cast(cmp1, K.floatx()) * K.cast(cmp2, K.floatx())
|
||||
|
||||
if K.backend() == 'cntk':
|
||||
# Generate standard normal values in shape (batch_size,1,d_t)
|
||||
# (since we can't use the dynamic batch_size with random.normal in CNTK,
|
||||
# we use normal_like instead with an input of an appropriate shape)
|
||||
rndNorms = C.random.normal_like(mu[:, 0:1, :]) # K.random_normal((1,d_t))
|
||||
else:
|
||||
rndNorms = K.random_normal((batch_size, 1, d_t))
|
||||
|
||||
rndVec = mu + K.expand_dims(sig) * rndNorms
|
||||
|
||||
# exactly one entry should be nonzero for each b,d combination; use sum to select it
|
||||
return K.sum(K.expand_dims(rndIndex) * rndVec, 1)
|
||||
|
||||
# prevent gradient from passing through sampling
|
||||
samp = L.Lambda(lambda pms: _zero_grad(sample(*pms), pms), output_shape=(d_t,))
|
||||
samp.trainable = False
|
||||
|
||||
return Model([pi, mu, sig], samp([pi, mu, sig]))
|
||||
|
||||
|
||||
# three options: biased or upper-bound loss require a single number of samples;
|
||||
# unbiased can take different numbers for the network and its gradient
|
||||
def response_loss_model(h, p, d_z, d_x, d_y, samples=1, use_upper_bound=False, gradient_samples=0):
|
||||
"""
|
||||
Create a Keras model that computes the loss of a response model on data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
h : (tensor, tensor) -> Layer
|
||||
Method for building a model of y given p and x
|
||||
|
||||
p : (tensor, tensor) -> Layer
|
||||
Method for building a model of p given z and x
|
||||
|
||||
d_z : int
|
||||
The number of dimensions in z
|
||||
|
||||
d_x : int
|
||||
The number of dimensions in x
|
||||
|
||||
d_y : int
|
||||
The number of dimensions in y
|
||||
|
||||
samples: int
|
||||
The number of samples to use
|
||||
|
||||
use_upper_bound : bool
|
||||
Whether to use an upper bound to the true loss
|
||||
(equivalent to adding a regularization penalty on the variance of h)
|
||||
|
||||
gradient_samples : int
|
||||
The number of separate additional samples to use when calculating the gradient.
|
||||
This can only be nonzero if use_upper_bound is False, in which case the gradient of
|
||||
the returned loss will be an unbiased estimate of the gradient of the true loss.
|
||||
|
||||
Returns
|
||||
-------
|
||||
A Keras model that takes as inputs z, x, and y and generates a single output containing the loss.
|
||||
|
||||
"""
|
||||
assert not(use_upper_bound and gradient_samples)
|
||||
|
||||
# sample: (() -> Layer, int) -> Layer
|
||||
def sample(f, n):
|
||||
assert n > 0
|
||||
if n == 1:
|
||||
return f()
|
||||
else:
|
||||
return L.average([f() for _ in range(n)])
|
||||
z, x, y = [L.Input((d,)) for d in [d_z, d_x, d_y]]
|
||||
if gradient_samples:
|
||||
# we want to separately sample the gradient; we use stop_gradient to treat the sampled model as constant
|
||||
# the overall computation ensures that we have an interpretable loss (y-h̅(p,x))²,
|
||||
# but also that the gradient is -2(y-h̅(p,x))∇h̅(p,x) with *different* samples used for each average
|
||||
diff = L.subtract([y, sample(lambda: h(p(z, x), x), samples)])
|
||||
grad = sample(lambda: h(p(z, x), x), gradient_samples)
|
||||
|
||||
def make_expr(grad, diff):
|
||||
return K.stop_gradient(diff) * (K.stop_gradient(diff + 2 * grad) - 2 * grad)
|
||||
expr = L.Lambda(lambda args: make_expr(*args))([grad, diff])
|
||||
elif use_upper_bound:
|
||||
expr = sample(lambda: L.Lambda(K.square)(L.subtract([y, h(p(z, x), x)])), samples)
|
||||
else:
|
||||
expr = L.Lambda(K.square)(L.subtract([y, sample(lambda: h(p(z, x), x), samples)]))
|
||||
return Model([z, x, y], [expr])
|
||||
|
||||
|
||||
class DeepIV(BaseCateEstimator):
|
||||
"""
|
||||
The Deep IV Estimator (see http://proceedings.mlr.press/v70/hartford17a/hartford17a.pdf).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_components : int
|
||||
Number of components in the mixture density network
|
||||
|
||||
m : (tensor, tensor) -> Layer
|
||||
Method for building a Keras model that featurizes the z and x inputs
|
||||
|
||||
h : (tensor, tensor) -> Layer
|
||||
Method for building a model of y given t and x
|
||||
|
||||
n_samples : int
|
||||
The number of samples to use
|
||||
|
||||
use_upper_bound_loss : bool, optional
|
||||
Whether to use an upper bound to the true loss
|
||||
(equivalent to adding a regularization penalty on the variance of h).
|
||||
Defaults to False.
|
||||
|
||||
n_gradient_samples : int, optional
|
||||
The number of separate additional samples to use when calculating the gradient.
|
||||
This can only be nonzero if use_upper_bound_loss is False, in which case the gradient of
|
||||
the returned loss will be an unbiased estimate of the gradient of the true loss.
|
||||
Defaults to 0.
|
||||
|
||||
optimizer : string, optional
|
||||
The optimizer to use. Defaults to "adam"
|
||||
|
||||
first_stage_options : dictionary, optional
|
||||
The keyword arguments to pass to Keras's `fit` method when training the first stage model.
|
||||
Defaults to `{"epochs": 100}`.
|
||||
|
||||
second_stage_options : dictionary, optional
|
||||
The keyword arguments to pass to Keras's `fit` method when training the second stage model.
|
||||
Defaults to `{"epochs": 100}`.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, *,
|
||||
n_components,
|
||||
m,
|
||||
h,
|
||||
n_samples, use_upper_bound_loss=False, n_gradient_samples=0,
|
||||
optimizer='adam',
|
||||
first_stage_options={"epochs": 100},
|
||||
second_stage_options={"epochs": 100}):
|
||||
self._n_components = n_components
|
||||
self._m = m
|
||||
self._h = h
|
||||
self._n_samples = n_samples
|
||||
self._use_upper_bound_loss = use_upper_bound_loss
|
||||
self._n_gradient_samples = n_gradient_samples
|
||||
self._optimizer = optimizer
|
||||
self._first_stage_options = first_stage_options
|
||||
self._second_stage_options = second_stage_options
|
||||
super().__init__()
|
||||
|
||||
@_deprecate_positional("X and Z should be passed by keyword only. In a future release "
|
||||
"we will disallow passing X and Z by position.", ['X', 'Z'])
|
||||
@BaseCateEstimator._wrap_fit
|
||||
def fit(self, Y, T, X, Z, *, inference=None):
|
||||
"""Estimate the counterfactual model from data.
|
||||
|
||||
That is, estimate functions τ(·, ·, ·), ∂τ(·, ·).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
Y: (n × d_y) matrix or vector of length n
|
||||
Outcomes for each sample
|
||||
T: (n × dₜ) matrix or vector of length n
|
||||
Treatments for each sample
|
||||
X: (n × dₓ) matrix
|
||||
Features for each sample
|
||||
Z: (n × d_z) matrix
|
||||
Instruments for each sample
|
||||
inference: string, :class:`.Inference` instance, or None
|
||||
Method for performing inference. This estimator supports 'bootstrap'
|
||||
(or an instance of :class:`.BootstrapInference`)
|
||||
|
||||
Returns
|
||||
-------
|
||||
self
|
||||
|
||||
"""
|
||||
Y, T, X, Z = check_input_arrays(Y, T, X, Z)
|
||||
assert 1 <= np.ndim(X) <= 2
|
||||
assert 1 <= np.ndim(Z) <= 2
|
||||
assert 1 <= np.ndim(T) <= 2
|
||||
assert 1 <= np.ndim(Y) <= 2
|
||||
assert np.shape(X)[0] == np.shape(Y)[0] == np.shape(T)[0] == np.shape(Z)[0]
|
||||
|
||||
# in case vectors were passed for Y or T, keep track of trailing dims for reshaping effect output
|
||||
|
||||
d_x, d_y, d_z, d_t = [np.shape(a)[1] if np.ndim(a) > 1 else 1 for a in [X, Y, Z, T]]
|
||||
x_in, y_in, z_in, t_in = [L.Input((d,)) for d in [d_x, d_y, d_z, d_t]]
|
||||
n_components = self._n_components
|
||||
|
||||
treatment_network = self._m(z_in, x_in)
|
||||
|
||||
# the dimensionality of the output of the network
|
||||
# TODO: is there a more robust way to do this?
|
||||
d_n = K.int_shape(treatment_network)[-1]
|
||||
|
||||
pi, mu, sig = mog_model(n_components, d_n, d_t)([treatment_network])
|
||||
|
||||
ll = mog_loss_model(n_components, d_t)([pi, mu, sig, t_in])
|
||||
|
||||
model = Model([z_in, x_in, t_in], [ll])
|
||||
model.add_loss(L.Lambda(K.mean)(ll))
|
||||
model.compile(self._optimizer)
|
||||
# TODO: do we need to give the user more control over other arguments to fit?
|
||||
model.fit([Z, X, T], [], **self._first_stage_options)
|
||||
|
||||
lm = response_loss_model(lambda t, x: self._h(t, x),
|
||||
lambda z, x: Model([z_in, x_in],
|
||||
# subtle point: we need to build a new model each time,
|
||||
# because each model encapsulates its randomness
|
||||
[mog_sample_model(n_components, d_t)([pi, mu, sig])])([z, x]),
|
||||
d_z, d_x, d_y,
|
||||
self._n_samples, self._use_upper_bound_loss, self._n_gradient_samples)
|
||||
|
||||
rl = lm([z_in, x_in, y_in])
|
||||
response_model = Model([z_in, x_in, y_in], [rl])
|
||||
response_model.add_loss(L.Lambda(K.mean)(rl))
|
||||
response_model.compile(self._optimizer)
|
||||
# TODO: do we need to give the user more control over other arguments to fit?
|
||||
response_model.fit([Z, X, Y], [], **self._second_stage_options)
|
||||
|
||||
self._effect_model = Model([t_in, x_in], [self._h(t_in, x_in)])
|
||||
|
||||
# TODO: it seems like we need to sum over the batch because we can only apply gradient to a scalar,
|
||||
# not a general tensor (because of how backprop works in every framework)
|
||||
# (alternatively, we could iterate through the batch in addition to iterating through the output,
|
||||
# but this seems annoying...)
|
||||
# Therefore, it's important that we use a batch size of 1 when we call predict with this model
|
||||
def calc_grad(t, x):
|
||||
h = self._h(t, x)
|
||||
all_grads = K.concatenate([g
|
||||
for i in range(d_y)
|
||||
for g in K.gradients(K.sum(h[:, i]), [t])])
|
||||
return K.reshape(all_grads, (-1, d_y, d_t))
|
||||
|
||||
self._marginal_effect_model = Model([t_in, x_in], L.Lambda(lambda tx: calc_grad(*tx))([t_in, x_in]))
|
||||
|
||||
def effect(self, X=None, T0=0, T1=1):
|
||||
"""
|
||||
Calculate the heterogeneous treatment effect τ(·,·,·).
|
||||
|
||||
The effect is calculated between the two treatment points
|
||||
conditional on a vector of features on a set of m test samples {T0ᵢ, T1ᵢ, Xᵢ}.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
T0: (m × dₜ) matrix
|
||||
Base treatments for each sample
|
||||
T1: (m × dₜ) matrix
|
||||
Target treatments for each sample
|
||||
X: optional (m × dₓ) matrix
|
||||
Features for each sample
|
||||
|
||||
Returns
|
||||
-------
|
||||
τ: (m × d_y) matrix
|
||||
Heterogeneous treatment effects on each outcome for each sample
|
||||
Note that when Y is a vector rather than a 2-dimensional array, the corresponding
|
||||
singleton dimension will be collapsed (so this method will return a vector)
|
||||
"""
|
||||
X, T0, T1 = check_input_arrays(X, T0, T1)
|
||||
if np.ndim(T0) == 0:
|
||||
T0 = np.repeat(T0, 1 if X is None else np.shape(X)[0])
|
||||
if np.ndim(T1) == 0:
|
||||
T1 = np.repeat(T1, 1 if X is None else np.shape(X)[0])
|
||||
if X is None:
|
||||
X = np.empty((np.shape(T0)[0], 0))
|
||||
return (self._effect_model.predict([T1, X]) - self._effect_model.predict([T0, X])).reshape((-1,) + self._d_y)
|
||||
|
||||
def marginal_effect(self, T, X=None):
|
||||
"""
|
||||
Calculate the marginal effect ∂τ(·, ·) around a base treatment point conditional on features.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
T: (m × dₜ) matrix
|
||||
Base treatments for each sample
|
||||
X: optional(m × dₓ) matrix
|
||||
Features for each sample
|
||||
|
||||
Returns
|
||||
-------
|
||||
grad_tau: (m × d_y × dₜ) array
|
||||
Heterogeneous marginal effects on each outcome for each sample
|
||||
Note that when Y or T is a vector rather than a 2-dimensional array,
|
||||
the corresponding singleton dimensions in the output will be collapsed
|
||||
(e.g. if both are vectors, then the output of this method will also be a vector)
|
||||
"""
|
||||
T, X = check_input_arrays(T, X)
|
||||
# TODO: any way to get this to work on batches of arbitrary size?
|
||||
return self._marginal_effect_model.predict([T, X], batch_size=1).reshape((-1,) + self._d_y + self._d_t)
|
||||
|
||||
def predict(self, T, X):
|
||||
"""Predict outcomes given treatment assignments and features.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
T: (m × dₜ) matrix
|
||||
Base treatments for each sample
|
||||
X: (m × dₓ) matrix
|
||||
Features for each sample
|
||||
|
||||
Returns
|
||||
-------
|
||||
Y: (m × d_y) matrix
|
||||
Outcomes for each sample
|
||||
Note that when Y is a vector rather than a 2-dimensional array, the corresponding
|
||||
singleton dimension will be collapsed (so this method will return a vector)
|
||||
"""
|
||||
T, X = check_input_arrays(T, X)
|
||||
return self._effect_model.predict([T, X]).reshape((-1,) + self._d_y)
|
||||
|
||||
|
||||
@deprecated("The DeepIVEstimator class has been renamed to DeepIV; "
|
||||
"an upcoming release will remove support for the old name")
|
||||
class DeepIVEstimator(DeepIV):
|
||||
pass
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import econml.iv.nnet as nnet
|
||||
from .utilities import deprecated
|
||||
|
||||
|
||||
@deprecated("The econml.deepiv.DeepIV class has renamed to econml.iv.nnet.DeepIV; "
|
||||
"an upcoming release will remove support for the old name")
|
||||
class DeepIV(nnet.DeepIV):
|
||||
pass
|
||||
|
||||
|
||||
@deprecated("The econml.deepiv.DeepIVEstimator class has been renamed to econml.iv.nnet.DeepIV; "
|
||||
"an upcoming release will remove support for the old name")
|
||||
class DeepIVEstimator(nnet.DeepIV):
|
||||
pass
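The old `econml.deepiv` module is thus reduced to thin aliases that defer to the relocated class. Below is a minimal, self-contained sketch of this renaming pattern; the names are illustrative and `deprecated` here is a simplified stand-in for `econml.utilities.deprecated`, not the actual implementation.

```Python
import warnings


class NewEstimator:
    """Stand-in for the class at its new location (e.g. econml.iv.nnet.DeepIV)."""

    def fit(self, X, y):
        return self


def deprecated(message):
    """Simplified stand-in for econml.utilities.deprecated: warn on construction."""
    def decorate(cls):
        class Wrapped(cls):
            def __init__(self, *args, **kwargs):
                warnings.warn(message, FutureWarning)
                super().__init__(*args, **kwargs)
        Wrapped.__name__ = cls.__name__
        return Wrapped
    return decorate


@deprecated("OldEstimator has been renamed to NewEstimator; "
            "an upcoming release will remove support for the old name")
class OldEstimator(NewEstimator):
    pass


OldEstimator().fit(None, None)  # emits a FutureWarning but still works
```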
|
||||
|
|
|
@@ -8,7 +8,8 @@ from .dml import _BaseDML
from .dml import _FirstStageWrapper, _FinalWrapper
from ..sklearn_extensions.linear_model import WeightedLassoCVWrapper
from ..sklearn_extensions.model_selection import WeightedStratifiedKFold
from ..inference import Inference, NormalInferenceResults
from ..inference import NormalInferenceResults
from ..inference._inference import Inference
from sklearn.linear_model import LogisticRegressionCV
from sklearn.base import clone, BaseEstimator
from sklearn.preprocessing import FunctionTransformer

@@ -100,7 +101,7 @@ class _GenericSingleOutcomeModelFinalWithCovInference(Inference):


class CausalForestDML(_BaseDML):
    """A Causal Forest [1]_ combined with double machine learning based residualization of the treatment
    """A Causal Forest [cfdml1]_ combined with double machine learning based residualization of the treatment
    and outcome variables. It fits a forest that solves the local moment equation problem:

    .. code-block::

@@ -203,7 +204,7 @@ class CausalForestDML(_BaseDML):

        weight(left) * weight(right) || theta(left) - theta(right)||_2^2 / weight(parent)^2

    as outlined in [1]_
    as outlined in [cfdml1]_

    max_depth : int, default=None
        The maximum depth of the tree. If None, then nodes are expanded until

@@ -370,7 +371,7 @@ class CausalForestDML(_BaseDML):

    References
    ----------
    .. [1] Athey, Susan, Julie Tibshirani, and Stefan Wager. "Generalized random forests."
    .. [cfdml1] Athey, Susan, Julie Tibshirani, and Stefan Wager. "Generalized random forests."
        The Annals of Statistics 47.2 (2019): 1148-1178
        https://arxiv.org/pdf/1610.01271.pdf

@@ -23,7 +23,6 @@ from .._cate_estimator import (DebiasedLassoCateEstimatorMixin,
                               StatsModelsCateEstimatorMixin,
                               LinearCateEstimator)
from ..inference import StatsModelsInference, GenericSingleTreatmentModelFinalInference
from ..sklearn_extensions.ensemble import SubsampledHonestForest
from ..sklearn_extensions.linear_model import (MultiOutputDebiasedLasso,
                                               StatsModelsLinearRegression,
                                               WeightedLassoCVWrapper)

@@ -1217,7 +1216,7 @@ def ForestDML(model_y, model_t,
              verbose=0,
              random_state=None):
    """ Instance of NonParamDML with a
    :class:`~econml.sklearn_extensions.ensemble.SubsampledHonestForest`
    :class:`~econml.grf.RegressionForest`
    as a final model, so as to enable non-parametric inference.

    Parameters

@@ -0,0 +1,9 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

from ._drlearner import (DRLearner, LinearDRLearner, SparseLinearDRLearner, ForestDRLearner)

__all__ = ["DRLearner",
           "LinearDRLearner",
           "SparseLinearDRLearner",
           "ForestDRLearner"]

econml/drlearner.py (1544 changed lines): diff not shown because of its large size.
|
@@ -0,0 +1,20 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

from ._inference import (BootstrapInference, GenericModelFinalInference, GenericSingleTreatmentModelFinalInference,
                         LinearModelFinalInference, StatsModelsInference, GenericModelFinalInferenceDiscrete,
                         LinearModelFinalInferenceDiscrete, StatsModelsInferenceDiscrete,
                         NormalInferenceResults, EmpiricalInferenceResults,
                         PopulationSummaryResults)

__all__ = ["BootstrapInference",
           "GenericModelFinalInference",
           "GenericSingleTreatmentModelFinalInference",
           "LinearModelFinalInference",
           "StatsModelsInference",
           "GenericModelFinalInferenceDiscrete",
           "LinearModelFinalInferenceDiscrete",
           "StatsModelsInferenceDiscrete",
           "NormalInferenceResults",
           "EmpiricalInferenceResults",
           "PopulationSummaryResults"]

|
@@ -0,0 +1,290 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""Bootstrap sampling."""
|
||||
import numpy as np
|
||||
from joblib import Parallel, delayed
|
||||
from sklearn.base import clone
|
||||
from scipy.stats import norm
|
||||
from collections import OrderedDict
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class BootstrapEstimator:
|
||||
"""Estimator that uses bootstrap sampling to wrap an existing estimator.
|
||||
|
||||
This estimator provides a `fit` method with the same signature as the wrapped estimator.
|
||||
|
||||
The bootstrap estimator will also wrap all other methods and attributes of the wrapped estimator,
|
||||
but return the average of the sampled calculations (this will fail for non-numeric outputs).
|
||||
|
||||
It will also provide a wrapper method suffixed with `_interval` for each method or attribute of
|
||||
the wrapped estimator that takes two additional optional keyword arguments `lower` and `upper` specifying
|
||||
the percentiles of the interval, and which uses `np.percentile` to return the corresponding lower
|
||||
and upper bounds based on the sampled calculations. For example, if the underlying estimator supports
|
||||
an `effect` method with signature `(X,T) -> Y`, this class will provide a method `effect_interval`
|
||||
with pseudo-signature `(lower=5, upper=95, X, T) -> (Y, Y)` (where `lower` and `upper` cannot be
|
||||
supplied as positional arguments).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
wrapped : object
|
||||
The basis for the clones used for estimation.
|
||||
This object must support a `fit` method which takes numpy arrays with consistent first dimensions
|
||||
as arguments.
|
||||
|
||||
n_bootstrap_samples : int, default: 100
|
||||
How many draws to perform.
|
||||
|
||||
n_jobs: int, default: None
|
||||
The maximum number of concurrently running jobs, as in joblib.Parallel.
|
||||
|
||||
verbose: int, default: 0
|
||||
Verbosity level
|
||||
|
||||
compute_means : bool, default: True
|
||||
Whether to pass calls through to the underlying collection and return the mean. Setting this
|
||||
to ``False`` can avoid ambiguities if the wrapped object itself has method names with an `_interval` suffix.
|
||||
|
||||
bootstrap_type: 'percentile', 'pivot', or 'normal', default 'pivot'
|
||||
Bootstrap method used to compute results. 'percentile' will result in using the empirical CDF of
|
||||
the replicated computations of the statistics. 'pivot' will also use the replicates but create a pivot
|
||||
interval that also relies on the estimate over the entire dataset. 'normal' will instead compute an interval
|
||||
assuming the replicates are normally distributed.
|
||||
"""
|
||||
|
||||
def __init__(self, wrapped,
|
||||
n_bootstrap_samples=100,
|
||||
n_jobs=None,
|
||||
verbose=0,
|
||||
compute_means=True,
|
||||
bootstrap_type='pivot'):
|
||||
self._instances = [clone(wrapped, safe=False) for _ in range(n_bootstrap_samples)]
|
||||
self._n_bootstrap_samples = n_bootstrap_samples
|
||||
self._n_jobs = n_jobs
|
||||
self._verbose = verbose
|
||||
self._compute_means = compute_means
|
||||
self._bootstrap_type = bootstrap_type
|
||||
self._wrapped = wrapped
|
||||
|
||||
# TODO: Add a __dir__ implementation?
|
||||
|
||||
@staticmethod
|
||||
def __stratified_indices(arr):
|
||||
assert 1 <= np.ndim(arr) <= 2
|
||||
unique = np.unique(arr, axis=0)
|
||||
indices = []
|
||||
for el in unique:
|
||||
ind, = np.where(np.all(arr == el, axis=1) if np.ndim(arr) == 2 else arr == el)
|
||||
indices.append(ind)
|
||||
return indices
|
||||
|
||||
def fit(self, *args, **named_args):
|
||||
"""
|
||||
Fit the model.
|
||||
|
||||
The full signature of this method is the same as that of the wrapped object's `fit` method.
|
||||
"""
|
||||
from .._cate_estimator import BaseCateEstimator # need to nest this here to avoid circular import
|
||||
|
||||
index_chunks = None
|
||||
if isinstance(self._instances[0], BaseCateEstimator):
|
||||
index_chunks = self._instances[0]._strata(*args, **named_args)
|
||||
if index_chunks is not None:
|
||||
index_chunks = self.__stratified_indices(index_chunks)
|
||||
if index_chunks is None:
|
||||
n_samples = np.shape(args[0] if args else named_args[(*named_args,)[0]])[0]
|
||||
index_chunks = [np.arange(n_samples)] # one chunk with all indices
|
||||
|
||||
indices = []
|
||||
for chunk in index_chunks:
|
||||
n_samples = len(chunk)
|
||||
indices.append(chunk[np.random.choice(n_samples,
|
||||
size=(self._n_bootstrap_samples, n_samples),
|
||||
replace=True)])
|
||||
|
||||
indices = np.hstack(indices)
|
||||
|
||||
def fit(x, *args, **kwargs):
|
||||
x.fit(*args, **kwargs)
|
||||
return x # Explicitly return x in case fit fails to return its target
|
||||
|
||||
def convertArg(arg, inds):
|
||||
if arg is None:
|
||||
return None
|
||||
arr = np.asarray(arg)
|
||||
if arr.ndim > 0:
|
||||
return arr[inds]
|
||||
else: # arg was a scalar, so we shouldn't have converted it
|
||||
return arg
|
||||
|
||||
self._instances = Parallel(n_jobs=self._n_jobs, prefer='threads', verbose=self._verbose)(
|
||||
delayed(fit)(obj,
|
||||
*[convertArg(arg, inds) for arg in args],
|
||||
**{arg: convertArg(named_args[arg], inds) for arg in named_args})
|
||||
for obj, inds in zip(self._instances, indices)
|
||||
)
|
||||
return self
|
||||
|
||||
def __getattr__(self, name):
|
||||
"""
|
||||
Get proxy attribute that wraps the corresponding attribute with the same name from the wrapped object.
|
||||
|
||||
Additionally, the suffix "_interval" is supported for getting an interval instead of a point estimate.
|
||||
"""
|
||||
|
||||
# don't proxy special methods
|
||||
if name.startswith('__'):
|
||||
raise AttributeError(name)
|
||||
|
||||
def proxy(make_call, name, summary):
|
||||
def summarize_with(f):
|
||||
results = np.array(Parallel(n_jobs=self._n_jobs, prefer='threads', verbose=self._verbose)(
|
||||
(f, (obj, name), {}) for obj in self._instances)), f(self._wrapped, name)
|
||||
return summary(*results)
|
||||
if make_call:
|
||||
def call(*args, **kwargs):
|
||||
return summarize_with(lambda obj, name: getattr(obj, name)(*args, **kwargs))
|
||||
return call
|
||||
else:
|
||||
return summarize_with(lambda obj, name: getattr(obj, name))
|
||||
|
||||
def get_mean():
|
||||
# for attributes that exist on the wrapped object, just compute the mean of the wrapped calls
|
||||
return proxy(callable(getattr(self._instances[0], name)), name, lambda arr, _: np.mean(arr, axis=0))
|
||||
|
||||
def get_std():
|
||||
prefix = name[: - len('_std')]
|
||||
return proxy(callable(getattr(self._instances[0], prefix)), prefix,
|
||||
lambda arr, _: np.std(arr, axis=0))
|
||||
|
||||
def get_interval():
|
||||
# if the attribute exists on the wrapped object once we remove the suffix,
|
||||
# then we should be computing a confidence interval for the wrapped calls
|
||||
prefix = name[: - len("_interval")]
|
||||
|
||||
def call_with_bounds(can_call, lower, upper):
|
||||
def percentile_bootstrap(arr, _):
|
||||
return np.percentile(arr, lower, axis=0), np.percentile(arr, upper, axis=0)
|
||||
|
||||
def pivot_bootstrap(arr, est):
|
||||
return 2 * est - np.percentile(arr, upper, axis=0), 2 * est - np.percentile(arr, lower, axis=0)
|
||||
|
||||
def normal_bootstrap(arr, est):
|
||||
std = np.std(arr, axis=0)
|
||||
return est - norm.ppf(upper / 100) * std, est - norm.ppf(lower / 100) * std
|
||||
|
||||
# TODO: studentized bootstrap? this would be more accurate in most cases but can we avoid
|
||||
# second level bootstrap which would be prohibitive computationally?
|
||||
|
||||
fn = {'percentile': percentile_bootstrap,
|
||||
'normal': normal_bootstrap,
|
||||
'pivot': pivot_bootstrap}[self._bootstrap_type]
|
||||
return proxy(can_call, prefix, fn)
|
||||
|
||||
can_call = callable(getattr(self._instances[0], prefix))
|
||||
if can_call:
|
||||
# collect extra arguments and pass them through, if the wrapped attribute was callable
|
||||
def call(*args, lower=5, upper=95, **kwargs):
|
||||
return call_with_bounds(can_call, lower, upper)(*args, **kwargs)
|
||||
return call
|
||||
else:
|
||||
# don't pass extra arguments if the wrapped attribute wasn't callable to begin with
|
||||
def call(lower=5, upper=95):
|
||||
return call_with_bounds(can_call, lower, upper)
|
||||
return call
|
||||
|
||||
def get_inference():
|
||||
# can't import from econml.inference at top level without creating cyclical dependencies
|
||||
from ._inference import EmpiricalInferenceResults, NormalInferenceResults
|
||||
from .._cate_estimator import LinearModelFinalCateEstimatorDiscreteMixin
|
||||
|
||||
prefix = name[: - len("_inference")]
|
||||
|
||||
def fname_transformer(x):
|
||||
return x
|
||||
|
||||
if prefix in ['const_marginal_effect', 'marginal_effect', 'effect']:
|
||||
inf_type = 'effect'
|
||||
elif prefix == 'coef_':
|
||||
inf_type = 'coefficient'
|
||||
if (hasattr(self._instances[0], 'cate_feature_names') and
|
||||
callable(self._instances[0].cate_feature_names)):
|
||||
def fname_transformer(x):
|
||||
return self._instances[0].cate_feature_names(x)
|
||||
elif prefix == 'intercept_':
|
||||
inf_type = 'intercept'
|
||||
else:
|
||||
raise AttributeError("Unsupported inference: " + name)
|
||||
|
||||
d_t = self._wrapped._d_t[0] if self._wrapped._d_t else 1
|
||||
if prefix == 'effect' or (isinstance(self._wrapped, LinearModelFinalCateEstimatorDiscreteMixin) and
|
||||
(inf_type == 'coefficient' or inf_type == 'intercept')):
|
||||
d_t = 1
|
||||
d_y = self._wrapped._d_y[0] if self._wrapped._d_y else 1
|
||||
|
||||
can_call = callable(getattr(self._instances[0], prefix))
|
||||
|
||||
kind = self._bootstrap_type
|
||||
if kind == 'percentile' or kind == 'pivot':
|
||||
def get_dist(est, arr):
|
||||
if kind == 'percentile':
|
||||
return arr
|
||||
elif kind == 'pivot':
|
||||
return 2 * est - arr
|
||||
else:
|
||||
raise ValueError("Invalid kind, must be either 'percentile' or 'pivot'")
|
||||
|
||||
def get_result():
|
||||
return proxy(can_call, prefix,
|
||||
lambda arr, est: EmpiricalInferenceResults(
|
||||
d_t=d_t, d_y=d_y,
|
||||
pred=est, pred_dist=get_dist(est, arr),
|
||||
inf_type=inf_type,
|
||||
fname_transformer=fname_transformer,
|
||||
**self._wrapped._input_names if hasattr(self._wrapped, "_input_names") else None))
|
||||
|
||||
# Note that inference results are always methods even if the inference is for a property
|
||||
# (e.g. coef__inference() is a method but coef_ is a property)
|
||||
# Therefore we must insert a lambda if getting inference for a non-callable
|
||||
return get_result() if can_call else get_result
|
||||
|
||||
else:
|
||||
assert kind == 'normal'
|
||||
|
||||
def normal_inference(*args, **kwargs):
|
||||
pred = getattr(self._wrapped, prefix)
|
||||
if can_call:
|
||||
pred = pred(*args, **kwargs)
|
||||
stderr = getattr(self, prefix + '_std')
|
||||
if can_call:
|
||||
stderr = stderr(*args, **kwargs)
|
||||
return NormalInferenceResults(
|
||||
d_t=d_t, d_y=d_y, pred=pred,
|
||||
pred_stderr=stderr, inf_type=inf_type,
|
||||
fname_transformer=fname_transformer,
|
||||
**self._wrapped._input_names if hasattr(self._wrapped, "_input_names") else None)
|
||||
|
||||
# If inference is for a property, create a fresh lambda to avoid passing args through
|
||||
return normal_inference if can_call else lambda: normal_inference()
|
||||
|
||||
caught = None
|
||||
m = None
|
||||
if name.endswith("_interval"):
|
||||
m = get_interval
|
||||
elif name.endswith("_std"):
|
||||
m = get_std
|
||||
elif name.endswith("_inference"):
|
||||
m = get_inference
|
||||
|
||||
# try to get interval/std first if appropriate,
|
||||
# since we don't prefer a wrapped method with this name
|
||||
if m is not None:
|
||||
try:
|
||||
return m()
|
||||
except AttributeError as err:
|
||||
caught = err
|
||||
if self._compute_means:
|
||||
return get_mean()
|
||||
|
||||
raise (caught if caught else AttributeError(name))
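For reference, the three interval constructions dispatched on `bootstrap_type` above can be illustrated with plain NumPy on a toy vector of bootstrap replicates (all numbers below are hypothetical):

```Python
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(0)
reps = rng.normal(loc=1.0, scale=0.2, size=100)   # bootstrap replicates of some statistic
est = 1.05                                        # estimate on the full data set
lower, upper = 5, 95

# 'percentile': empirical quantiles of the replicates
percentile = (np.percentile(reps, lower), np.percentile(reps, upper))
# 'pivot': reflect the replicate quantiles around the full-data estimate
pivot = (2 * est - np.percentile(reps, upper), 2 * est - np.percentile(reps, lower))
# 'normal': normal approximation centered at the estimate, using the replicates' spread
std = np.std(reps)
normal = (est - norm.ppf(upper / 100) * std, est - norm.ppf(lower / 100) * std)
print(percentile, pivot, normal)
```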
|
|
@@ -11,12 +11,12 @@ import scipy
from scipy.stats import norm
from statsmodels.iolib.table import SimpleTable

from .bootstrap import BootstrapEstimator
from .sklearn_extensions.linear_model import StatsModelsLinearRegression
from .utilities import (Summary, _safe_norm_ppf, broadcast_unit_treatments,
                        cross_product, inverse_onehot, ndim,
                        parse_final_model_params,
                        reshape_treatmentwise_effects, shape)
from ._bootstrap import BootstrapEstimator
from ..sklearn_extensions.linear_model import StatsModelsLinearRegression
from ..utilities import (Summary, _safe_norm_ppf, broadcast_unit_treatments,
                         cross_product, inverse_onehot, ndim,
                         parse_final_model_params,
                         reshape_treatmentwise_effects, shape)

"""Options for performing inference in estimators."""

@@ -68,6 +68,9 @@ class BootstrapInference(Inference):
    n_jobs: int, optional (default -1)
        The maximum number of concurrently running jobs, as in joblib.Parallel.

    verbose: int, default: 0
        Verbosity level

    bootstrap_type: 'percentile', 'pivot', or 'normal', default 'pivot'
        Bootstrap method used to compute results.
        'percentile' will result in using the empirical CDF of the replicated computations of the statistics.

@@ -76,14 +79,15 @@ class BootstrapInference(Inference):
        'normal' will instead compute an interval assuming the replicates are normally distributed.
    """

    def __init__(self, n_bootstrap_samples=100, n_jobs=-1, bootstrap_type='pivot'):
    def __init__(self, n_bootstrap_samples=100, n_jobs=-1, bootstrap_type='pivot', verbose=0):
        self._n_bootstrap_samples = n_bootstrap_samples
        self._n_jobs = n_jobs
        self._bootstrap_type = bootstrap_type
        self._verbose = verbose

    def fit(self, estimator, *args, **kwargs):
        est = BootstrapEstimator(estimator, self._n_bootstrap_samples, self._n_jobs, compute_means=False,
                                 bootstrap_type=self._bootstrap_type)
                                 bootstrap_type=self._bootstrap_type, verbose=self._verbose)
        est.fit(*args, **kwargs)
        self._est = est
        self._d_t = estimator._d_t
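A hedged usage sketch of the new `verbose` argument; the estimator and data are placeholders, not part of this commit:

```Python
from econml.inference import BootstrapInference

# 50 bootstrap draws, all cores, pivot intervals, with joblib progress messages
inference = BootstrapInference(n_bootstrap_samples=50, n_jobs=-1,
                               bootstrap_type='pivot', verbose=1)
# est.fit(Y, T, X=X, W=W, inference=inference)  # `est` is any EconML CATE estimator (placeholder)
```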
@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

__all__ = ["dml", "dr", "nnet", "sieve"]

@@ -0,0 +1,83 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import numpy as np
|
||||
from sklearn.base import clone
|
||||
from ..utilities import (add_intercept, fit_with_groups,
|
||||
hstack, inverse_onehot)
|
||||
from ..dml.dml import _FinalWrapper as _DMLFinalWrapper
|
||||
|
||||
# A cut-down version of the DML first stage wrapper, since we don't need to support linear first stages
|
||||
|
||||
|
||||
class _FirstStageWrapper:
|
||||
def __init__(self, model, discrete_target):
|
||||
self._model = clone(model, safe=False)
|
||||
self._discrete_target = discrete_target
|
||||
|
||||
def _combine(self, X, W, Z, n_samples, fitting=True):
|
||||
# output is
|
||||
# * a column of ones if X, W, and Z are all None
|
||||
# * just X or W or Z if both of the others are None
|
||||
# * hstack([arrs]) for whatever subset are not None otherwise
|
||||
|
||||
# ensure Z is 2D
|
||||
if Z is not None:
|
||||
Z = Z.reshape(n_samples, -1)
|
||||
|
||||
if X is None and W is None and Z is None:
|
||||
return np.ones((n_samples, 1))
|
||||
|
||||
arrs = [arr for arr in [X, W, Z] if arr is not None]
|
||||
|
||||
if len(arrs) == 1:
|
||||
return arrs[0]
|
||||
else:
|
||||
return hstack(arrs)
|
||||
|
||||
def fit(self, *, X, W, Target, Z=None, sample_weight=None, groups=None):
|
||||
if self._discrete_target:
|
||||
# In this case, the Target is the one-hot-encoding of the treatment variable
|
||||
# We need to go back to the label representation of the one-hot so as to call
|
||||
# the classifier.
|
||||
if np.any(np.all(Target == 0, axis=0)) or (not np.any(np.all(Target == 0, axis=1))):
|
||||
raise AttributeError("Provided crossfit folds contain training splits that " +
|
||||
"don't contain all treatments")
|
||||
Target = inverse_onehot(Target)
|
||||
|
||||
if sample_weight is not None:
|
||||
fit_with_groups(self._model, self._combine(X, W, Z, Target.shape[0]), Target,
|
||||
groups=groups, sample_weight=sample_weight)
|
||||
else:
|
||||
fit_with_groups(self._model, self._combine(X, W, Z, Target.shape[0]), Target,
|
||||
groups=groups)
|
||||
|
||||
def score(self, *, X, W, Target, Z=None, sample_weight=None):
|
||||
if hasattr(self._model, 'score'):
|
||||
if self._discrete_target:
|
||||
# In this case, the Target is the one-hot-encoding of the treatment variable
|
||||
# We need to go back to the label representation of the one-hot so as to call
|
||||
# the classifier.
|
||||
if np.any(np.all(Target == 0, axis=0)) or (not np.any(np.all(Target == 0, axis=1))):
|
||||
raise AttributeError("Provided crossfit folds contain training splits that " +
|
||||
"don't contain all treatments")
|
||||
Target = inverse_onehot(Target)
|
||||
|
||||
if sample_weight is not None:
|
||||
return self._model.score(self._combine(X, W, Z, Target.shape[0]), Target, sample_weight=sample_weight)
|
||||
else:
|
||||
return self._model.score(self._combine(X, W, Z, Target.shape[0]), Target)
|
||||
else:
|
||||
return None
|
||||
|
||||
def predict(self, X, W, Z=None):
|
||||
arrs = [arr for arr in [X, W, Z] if arr is not None]
|
||||
n_samples = arrs[0].shape[0] if arrs else 1
|
||||
if self._discrete_target:
|
||||
return self._model.predict_proba(self._combine(X, W, Z, n_samples, fitting=False))[:, 1:]
|
||||
else:
|
||||
return self._model.predict(self._combine(X, W, Z, n_samples, fitting=False))
|
||||
|
||||
|
||||
class _FinalWrapper(_DMLFinalWrapper):
|
||||
pass
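The `_combine` logic documented in the comments above can be summarized in a stand-alone sketch; this is an illustration of the described behavior, not the library code itself:

```Python
import numpy as np


def combine(X, W, Z, n_samples):
    # Mirrors the documented behavior of _FirstStageWrapper._combine (simplified sketch):
    # a column of ones if X, W and Z are all None; the single non-None array if only one
    # is provided; otherwise the horizontal stack of whichever arrays are not None.
    if Z is not None:
        Z = Z.reshape(n_samples, -1)
    arrs = [a for a in (X, W, Z) if a is not None]
    if not arrs:
        return np.ones((n_samples, 1))
    return arrs[0] if len(arrs) == 1 else np.hstack(arrs)


X = np.arange(6).reshape(3, 2)
Z = np.array([0, 1, 0])
print(combine(None, None, None, 3).shape)  # (3, 1): intercept-only design
print(combine(X, None, Z, 3).shape)        # (3, 3): X stacked with the reshaped instrument
```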
|
|
@@ -0,0 +1,21 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""Orthogonal IV for Heterogeneous Treatment Effects.

A Double/Orthogonal machine learning approach to estimation of heterogeneous
treatment effect with an endogenous treatment and an instrument. It
implements the DMLIV and related algorithms from the paper:

Machine Learning Estimation of Heterogeneous Treatment Effects with Instruments
Vasilis Syrgkanis, Victor Lei, Miruna Oprescu, Maggie Hei, Keith Battocchi, Greg Lewis
https://arxiv.org/abs/1905.10176

"""

from ._dml import DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV

__all__ = ["DMLATEIV",
           "ProjectedDMLATEIV",
           "DMLIV",
           "NonParamDMLIV"]
|
|
@@ -0,0 +1,931 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""Orthogonal IV for Heterogeneous Treatment Effects.
|
||||
|
||||
A Double/Orthogonal machine learning approach to estimation of heterogeneous
|
||||
treatment effect with an endogenous treatment and an instrument. It
|
||||
implements the DMLIV and related algorithms from the paper:
|
||||
|
||||
Machine Learning Estimation of Heterogeneous Treatment Effects with Instruments
|
||||
Vasilis Syrgkanis, Victor Lei, Miruna Oprescu, Maggie Hei, Keith Battocchi, Greg Lewis
|
||||
https://arxiv.org/abs/1905.10176
|
||||
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from sklearn.base import clone
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import FunctionTransformer
|
||||
|
||||
from ..._ortho_learner import _OrthoLearner
|
||||
from ..._cate_estimator import LinearModelFinalCateEstimatorMixin, StatsModelsCateEstimatorMixin
|
||||
from ...inference import StatsModelsInference
|
||||
from ...sklearn_extensions.linear_model import StatsModelsLinearRegression
|
||||
from ...utilities import _deprecate_positional
|
||||
from .._nuisance_wrappers import _FirstStageWrapper, _FinalWrapper
|
||||
|
||||
|
||||
class _BaseDMLATEIVModelFinal:
|
||||
def __init__(self):
|
||||
self._first_stage = LinearRegression(fit_intercept=False)
|
||||
self._model_final = _FinalWrapper(LinearRegression(fit_intercept=False),
|
||||
fit_cate_intercept=True, featurizer=None, use_weight_trick=False)
|
||||
|
||||
def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None):
|
||||
Y_res, T_res, Z_res = nuisances
|
||||
if Z_res.ndim == 1:
|
||||
Z_res = Z_res.reshape(-1, 1)
|
||||
# DMLATEIV is just like 2SLS; first regress T_res on Z_res, then regress Y_res on predicted T_res
|
||||
T_res_pred = self._first_stage.fit(Z_res, T_res,
|
||||
sample_weight=sample_weight).predict(Z_res)
|
||||
# TODO: allow the final model to actually use X? Then we'd need to rename the class
|
||||
# since we would actually be calculating a CATE rather than ATE.
|
||||
self._model_final.fit(X=None, T_res=T_res_pred, Y_res=Y_res, sample_weight=sample_weight)
|
||||
return self
|
||||
|
||||
def predict(self, X=None):
|
||||
# TODO: allow the final model to actually use X?
|
||||
return self._model_final.predict(X=None)
|
||||
|
||||
def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None):
|
||||
Y_res, T_res, Z_res = nuisances
|
||||
if Y_res.ndim == 1:
|
||||
Y_res = Y_res.reshape((-1, 1))
|
||||
if T_res.ndim == 1:
|
||||
T_res = T_res.reshape((-1, 1))
|
||||
# TODO: allow the final model to actually use X?
|
||||
effects = self._model_final.predict(X=None).reshape((-1, Y_res.shape[1], T_res.shape[1]))
|
||||
Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape)
|
||||
if sample_weight is not None:
|
||||
return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0))
|
||||
else:
|
||||
return np.mean((Y_res - Y_res_pred) ** 2)
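The comment in `_BaseDMLATEIVModelFinal.fit` above notes that the final step is essentially 2SLS on residualized data; a self-contained toy illustration (synthetic numbers, not library code):

```Python
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
n = 500
Z_res = rng.normal(size=(n, 1))
T_res = 0.8 * Z_res[:, 0] + rng.normal(scale=0.5, size=n)
Y_res = 2.0 * T_res + rng.normal(scale=0.5, size=n)   # true effect of 2.0

# First stage: project the treatment residual on the instrument residual
T_res_pred = LinearRegression(fit_intercept=False).fit(Z_res, T_res).predict(Z_res)
# Second stage: regress the outcome residual on the projected treatment residual
ate = LinearRegression(fit_intercept=False).fit(T_res_pred.reshape(-1, 1), Y_res).coef_[0]
print(round(ate, 2))  # approximately 2.0
```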
|
||||
|
||||
|
||||
class _BaseDMLATEIV(_OrthoLearner):
|
||||
def __init__(self, discrete_instrument=False,
|
||||
discrete_treatment=False,
|
||||
categories='auto',
|
||||
cv=2,
|
||||
n_splits='raise',
|
||||
mc_iters=None,
|
||||
mc_agg='mean',
|
||||
random_state=None):
|
||||
super().__init__(discrete_treatment=discrete_treatment,
|
||||
discrete_instrument=discrete_instrument,
|
||||
categories=categories,
|
||||
cv=cv,
|
||||
n_splits=n_splits,
|
||||
mc_iters=mc_iters,
|
||||
mc_agg=mc_agg,
|
||||
random_state=random_state)
|
||||
|
||||
def _gen_ortho_learner_model_final(self):
|
||||
return _BaseDMLATEIVModelFinal()
|
||||
|
||||
@_deprecate_positional("W and Z should be passed by keyword only. In a future release "
|
||||
"we will disallow passing W and Z by position.", ['W', 'Z'])
|
||||
def fit(self, Y, T, Z, W=None, *, sample_weight=None, sample_var=None, groups=None,
|
||||
cache_values=False, inference=None):
|
||||
"""
|
||||
Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
Y: (n, d_y) matrix or vector of length n
|
||||
Outcomes for each sample
|
||||
T: (n, d_t) matrix or vector of length n
|
||||
Treatments for each sample
|
||||
Z: (n, d_z) matrix
|
||||
Instruments for each sample
|
||||
X: optional(n, d_x) matrix or None (Default=None)
|
||||
Features for each sample
|
||||
sample_weight: optional(n,) vector or None (Default=None)
|
||||
Weights for each sample
|
||||
sample_var: optional(n,) vector or None (Default=None)
|
||||
Sample variance for each sample
|
||||
groups: (n,) vector, optional
|
||||
All rows corresponding to the same group will be kept together during splitting.
|
||||
If groups is not None, the `cv` argument passed to this class's initializer
|
||||
must support a 'groups' argument to its split method.
|
||||
cache_values: bool, default False
|
||||
Whether to cache inputs and first stage results, which will allow refitting a different final model
|
||||
inference: string,:class:`.Inference` instance, or None
|
||||
Method for performing inference. This estimator supports 'bootstrap'
|
||||
(or an instance of:class:`.BootstrapInference`).
|
||||
|
||||
Returns
|
||||
-------
|
||||
self: _BaseDMLATEIV instance
|
||||
"""
|
||||
# Replacing fit from _OrthoLearner, to enforce W=None and improve the docstring
|
||||
return super().fit(Y, T, W=W, Z=Z,
|
||||
sample_weight=sample_weight, sample_var=sample_var, groups=groups,
|
||||
cache_values=cache_values, inference=inference)
|
||||
|
||||
def score(self, Y, T, Z, W=None):
|
||||
"""
|
||||
Score the fitted CATE model on a new data set. Generates nuisance parameters
|
||||
for the new data set based on the fitted residual nuisance models created at fit time.
|
||||
It uses the mean prediction of the models fitted by the different crossfit folds.
|
||||
Then calculates the MSE of the final residual Y on residual T regression.
|
||||
|
||||
If model_final does not have a score method, then it raises an :exc:`.AttributeError`
|
||||
|
||||
Parameters
|
||||
----------
|
||||
Y: (n, d_y) matrix or vector of length n
|
||||
Outcomes for each sample
|
||||
T: (n, d_t) matrix or vector of length n
|
||||
Treatments for each sample
|
||||
Z: optional(n, d_z) matrix
|
||||
Instruments for each sample
|
||||
X: optional(n, d_x) matrix or None (Default=None)
|
||||
Features for each sample
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
score: float
|
||||
The MSE of the final CATE model on the new data.
|
||||
"""
|
||||
# Replacing score from _OrthoLearner, to enforce X=None and improve the docstring
|
||||
return super().score(Y, T, W=W, Z=Z)
|
||||
|
||||
|
||||
class _DMLATEIVModelNuisance:
|
||||
def __init__(self, model_Y_W, model_T_W, model_Z_W):
|
||||
self._model_Y_W = clone(model_Y_W, safe=False)
|
||||
self._model_T_W = clone(model_T_W, safe=False)
|
||||
self._model_Z_W = clone(model_Z_W, safe=False)
|
||||
|
||||
def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):
|
||||
assert X is None, "DML ATE IV does not accept features"
|
||||
self._model_Y_W.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups)
|
||||
self._model_T_W.fit(X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups)
|
||||
self._model_Z_W.fit(X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups)
|
||||
return self
|
||||
|
||||
def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None):
|
||||
assert X is None, "DML ATE IV does not accept features"
|
||||
if hasattr(self._model_Y_W, 'score'):
|
||||
Y_X_score = self._model_Y_W.score(X=X, W=W, Target=Y, sample_weight=sample_weight)
|
||||
else:
|
||||
Y_X_score = None
|
||||
if hasattr(self._model_T_W, 'score'):
|
||||
T_X_score = self._model_T_W.score(X=X, W=W, Target=T, sample_weight=sample_weight)
|
||||
else:
|
||||
T_X_score = None
|
||||
if hasattr(self._model_Z_W, 'score'):
|
||||
Z_X_score = self._model_Z_W.score(X=X, W=W, Target=Z, sample_weight=sample_weight)
|
||||
else:
|
||||
Z_X_score = None
|
||||
return Y_X_score, T_X_score, Z_X_score
|
||||
|
||||
def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None):
|
||||
assert X is None, "DML ATE IV does not accept features"
|
||||
Y_pred = self._model_Y_W.predict(X=X, W=W)
|
||||
T_pred = self._model_T_W.predict(X=X, W=W)
|
||||
Z_pred = self._model_Z_W.predict(X=X, W=W)
|
||||
if W is None: # In this case predict above returns a single row
|
||||
Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1))
|
||||
T_pred = np.tile(T_pred.reshape(1, -1), (T.shape[0], 1))
|
||||
Z_pred = np.tile(Z_pred.reshape(1, -1), (Z.shape[0], 1))
|
||||
Y_res = Y - Y_pred.reshape(Y.shape)
|
||||
T_res = T - T_pred.reshape(T.shape)
|
||||
Z_res = Z - Z_pred.reshape(Z.shape)
|
||||
return Y_res, T_res, Z_res
|
||||
|
||||
|
||||
class DMLATEIV(_BaseDMLATEIV):
|
||||
"""
|
||||
Implementation of the orthogonal/double ml method for ATE estimation with
|
||||
IV as described in
|
||||
|
||||
Double/Debiased Machine Learning for Treatment and Causal Parameters
|
||||
Victor Chernozhukov, Denis Chetverikov, Mert Demirer, Esther Duflo, Christian Hansen, Whitney Newey, James Robins
|
||||
https://arxiv.org/abs/1608.00060
|
||||
|
||||
    Requires that either the covariance of T and Z is independent of X, or that the effect
    is not heterogeneous in X, for correct recovery. Otherwise it estimates
    a biased ATE.
|
||||
"""
|
||||
|
||||
def __init__(self, *,
|
||||
model_Y_W,
|
||||
model_T_W,
|
||||
model_Z_W,
|
||||
discrete_treatment=False,
|
||||
discrete_instrument=False,
|
||||
categories='auto',
|
||||
cv=2,
|
||||
n_splits='raise',
|
||||
mc_iters=None,
|
||||
mc_agg='mean',
|
||||
random_state=None):
|
||||
self.model_Y_W = clone(model_Y_W, safe=False)
|
||||
self.model_T_W = clone(model_T_W, safe=False)
|
||||
self.model_Z_W = clone(model_Z_W, safe=False)
|
||||
super().__init__(discrete_instrument=discrete_instrument,
|
||||
discrete_treatment=discrete_treatment,
|
||||
categories=categories,
|
||||
cv=cv,
|
||||
n_splits=n_splits,
|
||||
mc_iters=mc_iters,
|
||||
mc_agg=mc_agg,
|
||||
random_state=random_state)
|
||||
|
||||
def _gen_ortho_learner_model_nuisance(self):
|
||||
return _DMLATEIVModelNuisance(
|
||||
model_Y_W=_FirstStageWrapper(clone(self.model_Y_W, safe=False), discrete_target=False),
|
||||
model_T_W=_FirstStageWrapper(clone(self.model_T_W, safe=False), discrete_target=self.discrete_treatment),
|
||||
model_Z_W=_FirstStageWrapper(clone(self.model_Z_W, safe=False), discrete_target=self.discrete_instrument))
|
||||
|
||||
|
||||
class _ProjectedDMLATEIVModelNuisance:
|
||||
|
||||
def __init__(self, model_Y_W, model_T_W, model_T_WZ):
|
||||
self._model_Y_W = clone(model_Y_W, safe=False)
|
||||
self._model_T_W = clone(model_T_W, safe=False)
|
||||
self._model_T_WZ = clone(model_T_WZ, safe=False)
|
||||
|
||||
def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):
|
||||
assert X is None, "DML ATE IV does not accept features"
|
||||
self._model_Y_W.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups)
|
||||
self._model_T_W.fit(X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups)
|
||||
self._model_T_WZ.fit(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight, groups=groups)
|
||||
return self
|
||||
|
||||
def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None):
|
||||
assert X is None, "DML ATE IV does not accept features"
|
||||
if hasattr(self._model_Y_W, 'score'):
|
||||
Y_X_score = self._model_Y_W.score(X=X, W=W, Target=Y, sample_weight=sample_weight)
|
||||
else:
|
||||
Y_X_score = None
|
||||
if hasattr(self._model_T_W, 'score'):
|
||||
T_X_score = self._model_T_W.score(X=X, W=W, Target=T, sample_weight=sample_weight)
|
||||
else:
|
||||
T_X_score = None
|
||||
if hasattr(self._model_T_WZ, 'score'):
|
||||
T_XZ_score = self._model_T_WZ.score(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight)
|
||||
else:
|
||||
T_XZ_score = None
|
||||
return Y_X_score, T_X_score, T_XZ_score
|
||||
|
||||
def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None):
|
||||
assert X is None, "DML ATE IV does not accept features"
|
||||
Y_pred = self._model_Y_W.predict(X, W)
|
||||
TX_pred = self._model_T_W.predict(X, W)
|
||||
TXZ_pred = self._model_T_WZ.predict(X, W, Z)
|
||||
if W is None: # In this case predict above returns a single row
|
||||
Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1))
|
||||
TX_pred = np.tile(TX_pred.reshape(1, -1), (T.shape[0], 1))
|
||||
Y_res = Y - Y_pred.reshape(Y.shape)
|
||||
T_res = T - TX_pred.reshape(T.shape)
|
||||
Z_res = TXZ_pred.reshape(T.shape) - TX_pred.reshape(T.shape)
|
||||
return Y_res, T_res, Z_res
|
||||
|
||||
|
||||
class ProjectedDMLATEIV(_BaseDMLATEIV):
|
||||
|
||||
def __init__(self, *,
|
||||
model_Y_W,
|
||||
model_T_W,
|
||||
model_T_WZ,
|
||||
discrete_treatment=False,
|
||||
discrete_instrument=False,
|
||||
categories='auto',
|
||||
cv=2,
|
||||
n_splits='raise',
|
||||
mc_iters=None,
|
||||
mc_agg='mean',
|
||||
random_state=None):
|
||||
self.model_Y_W = clone(model_Y_W, safe=False)
|
||||
self.model_T_W = clone(model_T_W, safe=False)
|
||||
self.model_T_WZ = clone(model_T_WZ, safe=False)
|
||||
super().__init__(discrete_instrument=discrete_instrument,
|
||||
discrete_treatment=discrete_treatment,
|
||||
categories=categories,
|
||||
cv=cv,
|
||||
n_splits=n_splits,
|
||||
mc_iters=mc_iters,
|
||||
mc_agg=mc_agg,
|
||||
random_state=random_state)
|
||||
|
||||
def _gen_ortho_learner_model_nuisance(self):
|
||||
return _ProjectedDMLATEIVModelNuisance(
|
||||
model_Y_W=_FirstStageWrapper(clone(self.model_Y_W, safe=False), discrete_target=False),
|
||||
model_T_W=_FirstStageWrapper(clone(self.model_T_W, safe=False), discrete_target=self.discrete_treatment),
|
||||
model_T_WZ=_FirstStageWrapper(clone(self.model_T_WZ, safe=False),
|
||||
discrete_target=self.discrete_treatment))
|
||||
|
||||
|
||||
class _BaseDMLIVModelNuisance:
|
||||
"""
|
||||
Nuisance model fits the three models at fit time and at predict time
|
||||
returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals.
|
||||
"""
|
||||
|
||||
def __init__(self, model_Y_X, model_T_X, model_T_XZ):
|
||||
self._model_Y_X = clone(model_Y_X, safe=False)
|
||||
self._model_T_X = clone(model_T_X, safe=False)
|
||||
self._model_T_XZ = clone(model_T_XZ, safe=False)
|
||||
|
||||
def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):
|
||||
# TODO: would it be useful to extend to handle controls ala vanilla DML?
|
||||
assert W is None, "DML IV does not accept controls"
|
||||
self._model_Y_X.fit(X=X, W=None, Target=Y, sample_weight=sample_weight, groups=groups)
|
||||
self._model_T_X.fit(X=X, W=None, Target=T, sample_weight=sample_weight, groups=groups)
|
||||
self._model_T_XZ.fit(X=X, W=None, Z=Z, Target=T, sample_weight=sample_weight, groups=groups)
|
||||
return self
|
||||
|
||||
def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None):
|
||||
assert W is None, "DML IV does not accept controls"
|
||||
if hasattr(self._model_Y_X, 'score'):
|
||||
Y_X_score = self._model_Y_X.score(X=X, W=W, Target=Y, sample_weight=sample_weight)
|
||||
else:
|
||||
Y_X_score = None
|
||||
if hasattr(self._model_T_X, 'score'):
|
||||
T_X_score = self._model_T_X.score(X=X, W=W, Target=T, sample_weight=sample_weight)
|
||||
else:
|
||||
T_X_score = None
|
||||
if hasattr(self._model_T_XZ, 'score'):
|
||||
T_XZ_score = self._model_T_XZ.score(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight)
|
||||
else:
|
||||
T_XZ_score = None
|
||||
return Y_X_score, T_X_score, T_XZ_score
|
||||
|
||||
def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None):
|
||||
assert W is None, "DML IV does not accept controls"
|
||||
Y_pred = self._model_Y_X.predict(X, W)
|
||||
TXZ_pred = self._model_T_XZ.predict(X, W, Z)
|
||||
TX_pred = self._model_T_X.predict(X, W)
|
||||
if X is None: # In this case predict above returns a single row
|
||||
Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1))
|
||||
TX_pred = np.tile(TX_pred.reshape(1, -1), (T.shape[0], 1))
|
||||
Y_res = Y - Y_pred.reshape(Y.shape)
|
||||
T_res = TXZ_pred.reshape(T.shape) - TX_pred.reshape(T.shape)
|
||||
return Y_res, T_res
|
||||
|
||||
|
||||
class _BaseDMLIVModelFinal:
|
||||
"""
|
||||
Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient
|
||||
that depends on X, i.e.
|
||||
|
||||
.. math ::
|
||||
Y - \\E[Y | X] = \\theta(X) \\cdot (\\E[T | X, Z] - \\E[T | X]) + \\epsilon
|
||||
|
||||
and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final
|
||||
residual on residual regression.
|
||||
"""
|
||||
|
||||
def __init__(self, model_final):
|
||||
self._model_final = clone(model_final, safe=False)
|
||||
|
||||
def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None):
|
||||
Y_res, T_res = nuisances
|
||||
self._model_final.fit(X, T_res, Y_res, sample_weight=sample_weight, sample_var=sample_var)
|
||||
return self
|
||||
|
||||
def predict(self, X=None):
|
||||
return self._model_final.predict(X)
|
||||
|
||||
def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None):
|
||||
Y_res, T_res = nuisances
|
||||
if Y_res.ndim == 1:
|
||||
Y_res = Y_res.reshape((-1, 1))
|
||||
if T_res.ndim == 1:
|
||||
T_res = T_res.reshape((-1, 1))
|
||||
effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1]))
|
||||
Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape)
|
||||
if sample_weight is not None:
|
||||
return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0))
|
||||
else:
|
||||
return np.mean((Y_res - Y_res_pred)**2)
|
||||
|
||||
|
||||
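# ---------------------------------------------------------------------------
# Illustrative sketch (editorial): the einsum used in the score method above. Each
# sample carries a (d_y x d_t) effect matrix; contracting it with that sample's
# treatment residuals gives the predicted outcome residuals. Toy shapes only.
import numpy as np

_effects = np.arange(2 * 1 * 3, dtype=float).reshape(2, 1, 3)   # (n=2, d_y=1, d_t=3)
_t_res = np.ones((2, 3))                                         # (n=2, d_t=3)
_y_res_pred = np.einsum('ijk,ik->ij', _effects, _t_res)          # shape (2, 1): [[3.], [12.]]
# ---------------------------------------------------------------------------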
class _BaseDMLIV(_OrthoLearner):
|
||||
"""
|
||||
The class _BaseDMLIV implements the base class of the DMLIV
|
||||
algorithm for estimating a CATE. It accepts three generic machine
|
||||
learning models:
|
||||
1) model_Y_X that estimates :math:`\\E[Y | X]`
|
||||
2) model_T_X that estimates :math:`\\E[T | X]`
|
||||
3) model_T_XZ that estimates :math:`\\E[T | X, Z]`
|
||||
These are estimated in a cross-fitting manner for each sample in the training set.
|
||||
Then it minimizes the square loss:
|
||||
|
||||
.. math::
|
||||
\\sum_i (Y_i - \\E[Y|X_i] - \\theta(X_i) \\cdot (\\E[T|X_i, Z_i] - \\E[T|X_i]))^2
|
||||
|
||||
This loss is minimized by the model_final class, which is passed as an input.
|
||||
In the two children classes {DMLIV, GenericDMLIV}, we implement different strategies of how to invoke
|
||||
machine learning algorithms to minimize this final square loss.
|
||||
|
||||
|
||||
Parameters
|
||||
----------
|
||||
model_Y_X : estimator
|
||||
model to estimate :math:`\\E[Y | X]`. Must support `fit` and `predict` methods.
|
||||
|
||||
model_T_X : estimator
|
||||
model to estimate :math:`\\E[T | X]`. Must support `fit` and `predict` methods
|
||||
|
||||
model_T_XZ : estimator
|
||||
model to estimate :math:`\\E[T | X, Z]`. Must support `fit(X, Z, T, *, sample_weights)`
|
||||
and `predict(X, Z)` methods.
|
||||
|
||||
model_final : estimator
|
||||
final model that at fit time takes as input :math:`(Y-\\E[Y|X])`, :math:`(\\E[T|X,Z]-\\E[T|X])` and X
|
||||
and supports method predict(X) that produces the CATE at X
|
||||
|
||||
discrete_instrument: bool, optional, default False
|
||||
Whether the instrument values should be treated as categorical, rather than continuous, quantities
|
||||
|
||||
discrete_treatment: bool, optional, default False
|
||||
Whether the treatment values should be treated as categorical, rather than continuous, quantities
|
||||
|
||||
categories: 'auto' or list, default 'auto'
|
||||
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
|
||||
The first category will be treated as the control treatment.
|
||||
|
||||
cv: int, cross-validation generator or an iterable, optional, default 2
|
||||
Determines the cross-validation splitting strategy.
|
||||
Possible inputs for cv are:
|
||||
|
||||
- None, to use the default 3-fold cross-validation,
|
||||
- integer, to specify the number of folds.
|
||||
- :term:`CV splitter`
|
||||
- An iterable yielding (train, test) splits as arrays of indices.
|
||||
|
||||
For integer/None inputs, if the treatment is discrete
|
||||
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
|
||||
:class:`~sklearn.model_selection.KFold` is used
|
||||
(with a random shuffle in either case).
|
||||
|
||||
Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
|
||||
W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
|
||||
|
||||
mc_iters: int, optional (default=None)
|
||||
The number of times to rerun the first stage models to reduce the variance of the nuisances.
|
||||
|
||||
mc_agg: {'mean', 'median'}, optional (default='mean')
|
||||
How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of
|
||||
cross-fitting.
|
||||
|
||||
random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
|
||||
If int, random_state is the seed used by the random number generator;
|
||||
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
|
||||
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
|
||||
by :mod:`np.random<numpy.random>`.
|
||||
"""
|
||||
|
||||
def __init__(self, discrete_instrument=False, discrete_treatment=False, categories='auto',
|
||||
cv=2,
|
||||
n_splits='raise',
|
||||
mc_iters=None, mc_agg='mean',
|
||||
random_state=None):
|
||||
super().__init__(discrete_treatment=discrete_treatment,
|
||||
discrete_instrument=discrete_instrument,
|
||||
categories=categories,
|
||||
cv=cv,
|
||||
n_splits=n_splits,
|
||||
mc_iters=mc_iters,
|
||||
mc_agg=mc_agg,
|
||||
random_state=random_state)
|
||||
|
||||
@_deprecate_positional("Z and X should be passed by keyword only. In a future release "
|
||||
"we will disallow passing Z and X by position.", ['X', 'Z'])
|
||||
def fit(self, Y, T, Z, X=None, *, sample_weight=None, sample_var=None, groups=None,
|
||||
cache_values=False, inference=None):
|
||||
"""
|
||||
Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
Y: (n, d_y) matrix or vector of length n
|
||||
Outcomes for each sample
|
||||
T: (n, d_t) matrix or vector of length n
|
||||
Treatments for each sample
|
||||
Z: (n, d_z) matrix
|
||||
Instruments for each sample
|
||||
X: optional(n, d_x) matrix or None (Default=None)
|
||||
Features for each sample
|
||||
sample_weight: optional(n,) vector or None (Default=None)
|
||||
Weights for each sample
|
||||
sample_var: optional(n,) vector or None (Default=None)
|
||||
Sample variance for each sample
|
||||
groups: (n,) vector, optional
|
||||
All rows corresponding to the same group will be kept together during splitting.
|
||||
If groups is not None, the `cv` argument passed to this class's initializer
|
||||
must support a 'groups' argument to its split method.
|
||||
cache_values: bool, default False
|
||||
Whether to cache inputs and first stage results, which will allow refitting a different final model
|
||||
inference: string, :class:`.Inference` instance, or None
|
||||
Method for performing inference. This estimator supports 'bootstrap'
|
||||
(or an instance of :class:`.BootstrapInference`).
|
||||
|
||||
Returns
|
||||
-------
|
||||
self: _BaseDMLIV
|
||||
"""
|
||||
# Replacing fit from _OrthoLearner, to enforce W=None and improve the docstring
|
||||
return super().fit(Y, T, X=X, Z=Z,
|
||||
sample_weight=sample_weight, sample_var=sample_var, groups=groups,
|
||||
cache_values=cache_values, inference=inference)
|
||||
|
||||
def score(self, Y, T, Z, X=None):
|
||||
"""
|
||||
Score the fitted CATE model on a new data set. Generates nuisance parameters
|
||||
for the new data set based on the fitted residual nuisance models created at fit time.
|
||||
It uses the mean prediction of the models fitted by the different crossfit folds.
|
||||
Then calculates the MSE of the final residual Y on residual T regression.
|
||||
|
||||
If model_final does not have a score method, then it raises an :exc:`.AttributeError`
|
||||
|
||||
Parameters
|
||||
----------
|
||||
Y: (n, d_y) matrix or vector of length n
|
||||
Outcomes for each sample
|
||||
T: (n, d_t) matrix or vector of length n
|
||||
Treatments for each sample
|
||||
Z: optional(n, d_z) matrix
|
||||
Instruments for each sample
|
||||
X: optional(n, d_x) matrix or None (Default=None)
|
||||
Features for each sample
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
score: float
|
||||
The MSE of the final CATE model on the new data.
|
||||
"""
|
||||
# Replacing score from _OrthoLearner, to enforce W=None and improve the docstring
|
||||
return super().score(Y, T, X=X, Z=Z)
|
||||
|
||||
@property
|
||||
def original_featurizer(self):
|
||||
return self.ortho_learner_model_final_._model_final._original_featurizer
|
||||
|
||||
@property
|
||||
def featurizer_(self):
|
||||
# NOTE This is used by the inference methods and has to be the overall featurizer; intended
|
||||
# for internal use by the library
|
||||
return self.ortho_learner_model_final_._model_final._featurizer
|
||||
|
||||
@property
|
||||
def model_final_(self):
|
||||
# NOTE This is used by the inference methods and is intended more for internal use by the library
|
||||
return self.ortho_learner_model_final_._model_final._model
|
||||
|
||||
@property
|
||||
def model_cate(self):
|
||||
"""
|
||||
Get the fitted final CATE model.
|
||||
|
||||
Returns
|
||||
-------
|
||||
model_cate: object of type(model_final)
|
||||
An instance of the model_final object that was fitted after calling fit which corresponds
|
||||
to the constant marginal CATE model.
|
||||
"""
|
||||
return self.ortho_learner_model_final_._model_final._model
|
||||
|
||||
@property
|
||||
def models_Y_X(self):
|
||||
"""
|
||||
Get the fitted models for :math:`\\E[Y | X]`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
models_Y_X: list of objects of type(`model_Y_X`)
|
||||
A list of instances of the `model_Y_X` object. Each element corresponds to a crossfitting
|
||||
fold and is the model instance that was fitted for that training fold.
|
||||
"""
|
||||
return [mdl._model_Y_X._model for mdl in super().models_nuisance_]
|
||||
|
||||
@property
|
||||
def models_T_X(self):
|
||||
"""
|
||||
Get the fitted models for :math:`\\E[T | X]`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
models_T_X: list of objects of type(`model_T_X`)
|
||||
A list of instances of the `model_T_X` object. Each element corresponds to a crossfitting
|
||||
fold and is the model instance that was fitted for that training fold.
|
||||
"""
|
||||
return [mdl._model_T_X._model for mdl in super().models_nuisance_]
|
||||
|
||||
@property
|
||||
def models_T_XZ(self):
|
||||
"""
|
||||
Get the fitted models for :math:`\\E[T | X, Z]`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
models_T_XZ: list of objects of type(`model_T_XZ`)
|
||||
A list of instances of the `model_T_XZ` object. Each element corresponds to a crossfitting
|
||||
fold and is the model instance that was fitted for that training fold.
|
||||
"""
|
||||
return [mdl._model_T_XZ._model for mdl in super().models_nuisance_]
|
||||
|
||||
@property
|
||||
def nuisance_scores_Y_X(self):
|
||||
"""
|
||||
Get the scores for Y_X model on the out-of-sample training data
|
||||
"""
|
||||
return self.nuisance_scores_[0]
|
||||
|
||||
@property
|
||||
def nuisance_scores_T_X(self):
|
||||
"""
|
||||
Get the scores for T_X model on the out-of-sample training data
|
||||
"""
|
||||
return self.nuisance_scores_[1]
|
||||
|
||||
@property
|
||||
def nuisance_scores_T_XZ(self):
|
||||
"""
|
||||
Get the scores for T_XZ model on the out-of-sample training data
|
||||
"""
|
||||
return self.nuisance_scores_[2]
|
||||
|
||||
def cate_feature_names(self, feature_names=None):
|
||||
"""
|
||||
Get the output feature names.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
feature_names: list of strings of length X.shape[1] or None
|
||||
The names of the input features. If None and X is a dataframe, it defaults to the column names
|
||||
from the dataframe.
|
||||
|
||||
Returns
|
||||
-------
|
||||
out_feature_names: list of strings or None
|
||||
The names of the output features :math:`\\phi(X)`, i.e. the features with respect to which the
|
||||
final constant marginal CATE model is linear. It is the names of the features that are associated
|
||||
with each entry of the :meth:`coef_` parameter. Not available when the featurizer is not None and
|
||||
does not have a method: `get_feature_names(feature_names)`. Otherwise None is returned.
|
||||
"""
|
||||
if feature_names is None:
|
||||
feature_names = self._input_names["feature_names"]
|
||||
if self.original_featurizer is None:
|
||||
return feature_names
|
||||
elif hasattr(self.original_featurizer, 'get_feature_names'):
|
||||
return self.original_featurizer.get_feature_names(feature_names)
|
||||
else:
|
||||
raise AttributeError("Featurizer does not have a method: get_feature_names!")
|
||||
|
||||
|
||||
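# ---------------------------------------------------------------------------
# Illustrative sketch (editorial): what cate_feature_names above relies on. Any
# featurizer exposing get_feature_names (older sklearn versions) can map raw input
# names to the names of the final CATE features; newer sklearn exposes
# get_feature_names_out instead. All names below are hypothetical.
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

_feat = PolynomialFeatures(degree=2, include_bias=False).fit(np.ones((10, 2)))
try:
    _names = _feat.get_feature_names(['age', 'income'])                 # sklearn < 1.2
except AttributeError:
    _names = list(_feat.get_feature_names_out(['age', 'income']))       # sklearn >= 1.0
# _names == ['age', 'income', 'age^2', 'age income', 'income^2']
# ---------------------------------------------------------------------------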
class DMLIV(LinearModelFinalCateEstimatorMixin, _BaseDMLIV):
|
||||
"""
|
||||
A child of the _BaseDMLIV class that specifies a particular effect model
|
||||
where the treatment effect is linear in some featurization of the variable X
|
||||
The features are created by a provided featurizer that supports fit_transform.
|
||||
Then an arbitrary model fits on the composite set of features.
|
||||
|
||||
Concretely, it assumes that :math:`\\theta(X)=<\\theta, \\phi(X)>` for some features :math:`\\phi(X)`
|
||||
and runs a linear model regression of :math:`Y-\\E[Y|X]` on :math:`\\phi(X) \\cdot (\\E[T|X,Z]-\\E[T|X])`.
|
||||
The features are created by the featurizer provided by the user. The particular
|
||||
linear model regression is also specified by the user (e.g. Lasso, ElasticNet)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
model_Y_X : estimator
|
||||
model to estimate :math:`\\E[Y | X]`. Must support `fit` and `predict` methods.
|
||||
|
||||
model_T_X : estimator
|
||||
model to estimate :math:`\\E[T | X]`. Must support `fit` and either `predict` or `predict_proba` methods,
|
||||
depending on whether the treatment is discrete.
|
||||
|
||||
model_T_XZ : estimator
|
||||
model to estimate :math:`\\E[T | X, Z]`. Must support `fit` and either `predict` or `predict_proba` methods,
|
||||
depending on whether the treatment is discrete.
|
||||
|
||||
model_final : estimator
|
||||
final linear model for predicting :math:`(Y-\\E[Y|X])` from :math:`\\phi(X) \\cdot (\\E[T|X,Z]-\\E[T|X])`
|
||||
The method is only valid if this model is linear (e.g. Lasso, ElasticNet, LinearRegression).
|
||||
|
||||
featurizer: :term:`transformer`, optional, default None
|
||||
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
|
||||
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
|
||||
If featurizer=None, then CATE is trained on X.
|
||||
|
||||
fit_cate_intercept : bool, optional, default True
|
||||
Whether the linear CATE model should have a constant term.
|
||||
|
||||
cv: int, cross-validation generator or an iterable, optional, default 2
|
||||
Determines the cross-validation splitting strategy.
|
||||
Possible inputs for cv are:
|
||||
|
||||
- None, to use the default 3-fold cross-validation,
|
||||
- integer, to specify the number of folds.
|
||||
- :term:`CV splitter`
|
||||
- An iterable yielding (train, test) splits as arrays of indices.
|
||||
|
||||
For integer/None inputs, if the treatment is discrete
|
||||
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
|
||||
:class:`~sklearn.model_selection.KFold` is used
|
||||
(with a random shuffle in either case).
|
||||
|
||||
Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
|
||||
W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
|
||||
|
||||
mc_iters: int, optional (default=None)
|
||||
The number of times to rerun the first stage models to reduce the variance of the nuisances.
|
||||
|
||||
mc_agg: {'mean', 'median'}, optional (default='mean')
|
||||
How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of
|
||||
cross-fitting.
|
||||
|
||||
discrete_instrument: bool, optional, default False
|
||||
Whether the instrument values should be treated as categorical, rather than continuous, quantities
|
||||
|
||||
discrete_treatment: bool, optional, default False
|
||||
Whether the treatment values should be treated as categorical, rather than continuous, quantities
|
||||
|
||||
categories: 'auto' or list, default 'auto'
|
||||
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
|
||||
The first category will be treated as the control treatment.
|
||||
|
||||
random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
|
||||
If int, random_state is the seed used by the random number generator;
|
||||
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
|
||||
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
|
||||
by :mod:`np.random<numpy.random>`.
|
||||
"""
|
||||
|
||||
def __init__(self, *,
|
||||
model_Y_X,
|
||||
model_T_X,
|
||||
model_T_XZ,
|
||||
model_final,
|
||||
featurizer=None,
|
||||
fit_cate_intercept=True,
|
||||
cv=2,
|
||||
n_splits='raise',
|
||||
mc_iters=None,
|
||||
mc_agg='mean',
|
||||
discrete_instrument=False, discrete_treatment=False,
|
||||
categories='auto', random_state=None):
|
||||
self.model_Y_X = clone(model_Y_X, safe=False)
|
||||
self.model_T_X = clone(model_T_X, safe=False)
|
||||
self.model_T_XZ = clone(model_T_XZ, safe=False)
|
||||
self.model_final = clone(model_final, safe=False)
|
||||
self.featurizer = clone(featurizer, safe=False)
|
||||
self.fit_cate_intercept = fit_cate_intercept
|
||||
super().__init__(cv=cv,
|
||||
n_splits=n_splits,
|
||||
mc_iters=mc_iters,
|
||||
mc_agg=mc_agg,
|
||||
discrete_instrument=discrete_instrument,
|
||||
discrete_treatment=discrete_treatment,
|
||||
categories=categories,
|
||||
random_state=random_state)
|
||||
|
||||
def _gen_ortho_learner_model_nuisance(self):
|
||||
return _BaseDMLIVModelNuisance(_FirstStageWrapper(clone(self.model_Y_X, safe=False), False),
|
||||
_FirstStageWrapper(clone(self.model_T_X, safe=False), self.discrete_treatment),
|
||||
_FirstStageWrapper(clone(self.model_T_XZ, safe=False), self.discrete_treatment))
|
||||
|
||||
def _gen_ortho_learner_model_final(self):
|
||||
return _BaseDMLIVModelFinal(_FinalWrapper(clone(self.model_final, safe=False),
|
||||
fit_cate_intercept=self.fit_cate_intercept,
|
||||
featurizer=clone(self.featurizer, safe=False),
|
||||
use_weight_trick=False))
|
||||
|
||||
@property
|
||||
def bias_part_of_coef(self):
|
||||
return self.ortho_learner_model_final_._model_final._fit_cate_intercept
|
||||
|
||||
@property
|
||||
def fit_cate_intercept_(self):
|
||||
return self.ortho_learner_model_final_._model_final._fit_cate_intercept
|
||||
|
||||
|
||||
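# ---------------------------------------------------------------------------
# Illustrative usage sketch (editorial): wiring DMLIV (defined above) with generic
# first-stage learners and a linear final model on synthetic data. The constructor
# arguments are the ones defined in this module; the data-generating and learner
# choices are hypothetical.
import numpy as np
from sklearn.linear_model import Lasso, LinearRegression

_rng = np.random.RandomState(1)
_X = _rng.normal(size=(1000, 2))
_Z = _rng.normal(size=(1000, 1))
_T = _Z[:, 0] + 0.5 * _X[:, 0] + _rng.normal(size=1000)
_Y = (1 + _X[:, 0]) * _T + _rng.normal(size=1000)

_est = DMLIV(model_Y_X=LinearRegression(),
             model_T_X=LinearRegression(),
             model_T_XZ=LinearRegression(),
             model_final=Lasso(alpha=0.01))
_est.fit(_Y, _T, Z=_Z, X=_X)
_cate = _est.effect(_X[:5])   # heterogeneous effect estimates at the first five points
# ---------------------------------------------------------------------------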
class NonParamDMLIV(_BaseDMLIV):
|
||||
"""
|
||||
A child of the _BaseDMLIV class that allows for an arbitrary square loss based ML
|
||||
method in the final stage of the DMLIV algorithm. The method has to support
|
||||
sample weights, i.e. its fit method must accept a sample_weight keyword argument (as, e.g., random forests do):
|
||||
fit(X, y, sample_weight=None)
|
||||
It achieves this by re-writing the final stage square loss of the DMLIV algorithm as:
|
||||
|
||||
.. math ::
|
||||
\\sum_i (\\E[T|X_i, Z_i] - \\E[T|X_i])^2 * ((Y_i - \\E[Y|X_i])/(\\E[T|X_i, Z_i] - \\E[T|X_i]) - \\theta(X))^2
|
||||
|
||||
Then this can be viewed as a weighted square loss regression, where the target label is
|
||||
|
||||
.. math ::
|
||||
\\tilde{Y}_i = (Y_i - \\E[Y|X_i])/(\\E[T|X_i, Z_i] - \\E[T|X_i])
|
||||
|
||||
and each sample has a weight of
|
||||
|
||||
.. math ::
|
||||
V(X_i) = (\\E[T|X_i, Z_i] - \\E[T|X_i])^2
|
||||
|
||||
Thus we can call any regression model with inputs:
|
||||
|
||||
fit(X, :math:`\\tilde{Y}_i`, sample_weight= :math:`V(X_i)`)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
model_Y_X : estimator
|
||||
model to estimate :math:`\\E[Y | X]`. Must support `fit` and `predict` methods.
|
||||
|
||||
model_T_X : estimator
|
||||
model to estimate :math:`\\E[T | X]`. Must support `fit` and either `predict` or `predict_proba` methods,
|
||||
depending on whether the treatment is discrete.
|
||||
|
||||
model_T_XZ : estimator
|
||||
model to estimate :math:`\\E[T | X, Z]`. Must support `fit` and either `predict` or `predict_proba` methods,
|
||||
depending on whether the treatment is discrete.
|
||||
|
||||
model_final : estimator
|
||||
final model for predicting :math:`\\tilde{Y}` from X with sample weights V(X)
|
||||
|
||||
cv: int, cross-validation generator or an iterable, optional, default 2
|
||||
Determines the cross-validation splitting strategy.
|
||||
Possible inputs for cv are:
|
||||
|
||||
- None, to use the default 3-fold cross-validation,
|
||||
- integer, to specify the number of folds.
|
||||
- :term:`CV splitter`
|
||||
- An iterable yielding (train, test) splits as arrays of indices.
|
||||
|
||||
For integer/None inputs, if the treatment is discrete
|
||||
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
|
||||
:class:`~sklearn.model_selection.KFold` is used
|
||||
(with a random shuffle in either case).
|
||||
|
||||
Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
|
||||
W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
|
||||
|
||||
mc_iters: int, optional (default=None)
|
||||
The number of times to rerun the first stage models to reduce the variance of the nuisances.
|
||||
|
||||
mc_agg: {'mean', 'median'}, optional (default='mean')
|
||||
How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of
|
||||
cross-fitting.
|
||||
|
||||
discrete_instrument: bool, optional, default False
|
||||
Whether the instrument values should be treated as categorical, rather than continuous, quantities
|
||||
|
||||
discrete_treatment: bool, optional, default False
|
||||
Whether the treatment values should be treated as categorical, rather than continuous, quantities
|
||||
|
||||
categories: 'auto' or list, default 'auto'
|
||||
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
|
||||
The first category will be treated as the control treatment.
|
||||
|
||||
random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
|
||||
If int, random_state is the seed used by the random number generator;
|
||||
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
|
||||
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
|
||||
by :mod:`np.random<numpy.random>`.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, *,
|
||||
model_Y_X,
|
||||
model_T_X,
|
||||
model_T_XZ,
|
||||
model_final,
|
||||
featurizer=None,
|
||||
cv=2,
|
||||
n_splits='raise',
|
||||
mc_iters=None,
|
||||
mc_agg='mean',
|
||||
discrete_instrument=False,
|
||||
discrete_treatment=False,
|
||||
categories='auto',
|
||||
random_state=None):
|
||||
self.model_Y_X = clone(model_Y_X, safe=False)
|
||||
self.model_T_X = clone(model_T_X, safe=False)
|
||||
self.model_T_XZ = clone(model_T_XZ, safe=False)
|
||||
self.model_final = clone(model_final, safe=False)
|
||||
self.featurizer = clone(featurizer, safe=False)
|
||||
super().__init__(cv=cv,
|
||||
n_splits=n_splits,
|
||||
mc_iters=mc_iters,
|
||||
mc_agg=mc_agg,
|
||||
discrete_instrument=discrete_instrument,
|
||||
discrete_treatment=discrete_treatment,
|
||||
categories=categories,
|
||||
random_state=random_state)
|
||||
|
||||
def _gen_ortho_learner_model_nuisance(self):
|
||||
return _BaseDMLIVModelNuisance(_FirstStageWrapper(clone(self.model_Y_X, safe=False), False),
|
||||
_FirstStageWrapper(clone(self.model_T_X, safe=False), self.discrete_treatment),
|
||||
_FirstStageWrapper(clone(self.model_T_XZ, safe=False), self.discrete_treatment))
|
||||
|
||||
def _gen_ortho_learner_model_final(self):
|
||||
return _BaseDMLIVModelFinal(_FinalWrapper(clone(self.model_final, safe=False),
|
||||
fit_cate_intercept=False,
|
||||
featurizer=clone(self.featurizer, safe=False),
|
||||
use_weight_trick=True))
|
|
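# ---------------------------------------------------------------------------
# Illustrative sketch (editorial): the re-weighting trick described in the
# NonParamDMLIV docstring above. Given the residuals, a weighted regression of the
# pseudo-outcome Y_res / T_res on X with weights T_res**2 targets theta(X). The
# residuals here are simulated directly; all names are hypothetical.
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

_rng = np.random.RandomState(2)
_X = _rng.uniform(-1, 1, size=(2000, 1))
_t_res = _rng.normal(size=2000)                   # stands in for E[T|X,Z] - E[T|X]
_theta = 1 + 2 * _X[:, 0]                         # true CATE
_y_res = _theta * _t_res + 0.1 * _rng.normal(size=2000)

_pseudo_y = _y_res / _t_res                       # \tilde{Y}_i
_weights = _t_res ** 2                            # V(X_i)
_final = GradientBoostingRegressor().fit(_X, _pseudo_y, sample_weight=_weights)
# _final.predict now approximates theta(x) = 1 + 2*x
# ---------------------------------------------------------------------------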
@ -0,0 +1,19 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""Orthogonal IV for Heterogeneous Treatment Effects.
|
||||
|
||||
A Double/Orthogonal machine learning approach to estimation of heterogeneous
|
||||
treatment effect with an endogenous treatment and an instrument. It
|
||||
implements the DMLIV and related algorithms from the paper:
|
||||
|
||||
Machine Learning Estimation of Heterogeneous Treatment Effects with Instruments
|
||||
Vasilis Syrgkanis, Victor Lei, Miruna Oprescu, Maggie Hei, Keith Battocchi, Greg Lewis
|
||||
https://arxiv.org/abs/1905.10176
|
||||
|
||||
"""
|
||||
|
||||
from ._dr import IntentToTreatDRIV, LinearIntentToTreatDRIV
|
||||
|
||||
__all__ = ["IntentToTreatDRIV",
|
||||
"LinearIntentToTreatDRIV"]
|
|
@ -0,0 +1,795 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""Orthogonal IV for Heterogeneous Treatment Effects.
|
||||
|
||||
A Double/Orthogonal machine learning approach to estimation of heterogeneous
|
||||
treatment effect with an endogenous treatment and an instrument. It
|
||||
implements the DMLIV and related algorithms from the paper:
|
||||
|
||||
Machine Learning Estimation of Heterogeneous Treatment Effects with Instruments
|
||||
Vasilis Syrgkanis, Victor Lei, Miruna Oprescu, Maggie Hei, Keith Battocchi, Greg Lewis
|
||||
https://arxiv.org/abs/1905.10176
|
||||
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from sklearn.base import clone
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import FunctionTransformer
|
||||
|
||||
from ..._ortho_learner import _OrthoLearner
|
||||
from ..._cate_estimator import LinearModelFinalCateEstimatorMixin, StatsModelsCateEstimatorMixin
|
||||
from ...inference import StatsModelsInference
|
||||
from ...sklearn_extensions.linear_model import StatsModelsLinearRegression
|
||||
from ...utilities import (_deprecate_positional, add_intercept, filter_none_kwargs,
|
||||
inverse_onehot)
|
||||
from .._nuisance_wrappers import _FirstStageWrapper, _FinalWrapper
|
||||
|
||||
|
||||
class _BaseDRIVModelFinal:
|
||||
"""
|
||||
Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient
|
||||
that depends on X, i.e.
|
||||
|
||||
.. math ::
|
||||
Y - \\E[Y | X] = \\theta(X) \\cdot (\\E[T | X, Z] - \\E[T | X]) + \\epsilon
|
||||
|
||||
and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final
|
||||
residual on residual regression.
|
||||
"""
|
||||
|
||||
def __init__(self, model_final, featurizer,
|
||||
discrete_treatment, discrete_instrument,
|
||||
fit_cate_intercept, cov_clip, opt_reweighted):
|
||||
self._model_final = clone(model_final, safe=False)
|
||||
self._fit_cate_intercept = fit_cate_intercept
|
||||
self._original_featurizer = clone(featurizer, safe=False)
|
||||
self._discrete_treatment = discrete_treatment
|
||||
self._discrete_instrument = discrete_instrument
|
||||
if self._fit_cate_intercept:
|
||||
add_intercept_trans = FunctionTransformer(add_intercept,
|
||||
validate=True)
|
||||
if featurizer:
|
||||
self._featurizer = Pipeline([('featurize', self._original_featurizer),
|
||||
('add_intercept', add_intercept_trans)])
|
||||
else:
|
||||
self._featurizer = add_intercept_trans
|
||||
else:
|
||||
self._featurizer = self._original_featurizer
|
||||
self._cov_clip = cov_clip
|
||||
self._opt_reweighted = opt_reweighted
|
||||
|
||||
def _effect_estimate(self, nuisances):
|
||||
prel_theta, res_t, res_y, res_z, cov = [nuisance.reshape(nuisances[0].shape) for nuisance in nuisances]
|
||||
|
||||
# Estimate final model of theta(X) by minimizing the square loss:
|
||||
# (prel_theta(X) + (Y_res - prel_theta(X) * T_res) * Z_res / cov[T,Z | X] - theta(X))^2
|
||||
# We clip the covariance so that it is bounded away from zero, so as to reduce variance
|
||||
# at the expense of some small bias. For points with very small covariance we revert
|
||||
# to the model-based preliminary estimate and do not add the correction term.
|
||||
cov_sign = np.sign(cov)
|
||||
cov_sign[cov_sign == 0] = 1
|
||||
clipped_cov = cov_sign * np.clip(np.abs(cov),
|
||||
self._cov_clip, np.inf)
|
||||
return prel_theta + (res_y - prel_theta * res_t) * res_z / clipped_cov, clipped_cov
|
||||
|
||||
def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None):
|
||||
self.d_y = Y.shape[1:]
|
||||
self.d_t = nuisances[1].shape[1:]
|
||||
self.d_z = nuisances[3].shape[1:]
|
||||
|
||||
# TODO: if opt_reweighted is False, we could change the logic to support multidimensional treatments,
|
||||
# instruments, and outcomes
|
||||
if self.d_y and self.d_y[0] > 2:
|
||||
raise AttributeError("DRIV only supports a single outcome")
|
||||
|
||||
if self.d_t and self.d_t[0] > 1:
|
||||
if self._discrete_treatment:
|
||||
raise AttributeError("DRIV only supports binary treatments")
|
||||
else:
|
||||
raise AttributeError("DRIV only supports single-dimensional continuous treatments")
|
||||
|
||||
if self.d_z and self.d_z[0] > 1:
|
||||
if self._discrete_instrument:
|
||||
raise AttributeError("DRIV only supports binary instruments")
|
||||
else:
|
||||
raise AttributeError("DRIV only supports single-dimensional continuous instruments")
|
||||
|
||||
theta_dr, clipped_cov = self._effect_estimate(nuisances)
|
||||
|
||||
if (X is not None) and (self._featurizer is not None):
|
||||
X = self._featurizer.fit_transform(X)
|
||||
if self._opt_reweighted and (sample_weight is not None):
|
||||
sample_weight = sample_weight * clipped_cov.ravel()**2
|
||||
elif self._opt_reweighted:
|
||||
sample_weight = clipped_cov.ravel()**2
|
||||
self._model_final.fit(X, theta_dr, **filter_none_kwargs(sample_weight=sample_weight, sample_var=sample_var))
|
||||
|
||||
return self
|
||||
|
||||
def predict(self, X=None):
|
||||
if (X is not None) and (self._featurizer is not None):
|
||||
X = self._featurizer.transform(X)
|
||||
return self._model_final.predict(X).reshape((-1,) + self.d_y + self.d_t)
|
||||
|
||||
def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None):
|
||||
theta_dr, clipped_cov = self._effect_estimate(nuisances)
|
||||
|
||||
if (X is not None) and (self._featurizer is not None):
|
||||
X = self._featurizer.transform(X)
|
||||
|
||||
if self._opt_reweighted and (sample_weight is not None):
|
||||
sample_weight = sample_weight * clipped_cov.ravel()**2
|
||||
elif self._opt_reweighted:
|
||||
sample_weight = clipped_cov.ravel()**2
|
||||
|
||||
return np.average((theta_dr.ravel() - self._model_final.predict(X).ravel())**2,
|
||||
weights=sample_weight, axis=0)
|
||||
|
||||
|
||||
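# ---------------------------------------------------------------------------
# Illustrative sketch (editorial): the doubly-robust correction computed by
# _effect_estimate above, on toy arrays. Where |cov| falls below the clip
# threshold the correction term is damped, so the estimate stays close to the
# preliminary theta at the cost of some bias. Values are hypothetical.
import numpy as np

_prel_theta = np.array([1.0, 1.0, 1.0])
_res_t = np.array([0.5, 0.2, 0.1])
_res_y = np.array([1.0, 0.5, 0.2])
_res_z = np.array([1.0, -1.0, 1.0])
_cov = np.array([0.5, 0.01, -0.3])               # estimated Cov(T, Z | X)
_cov_clip = 0.1

_sign = np.sign(_cov)
_sign[_sign == 0] = 1
_clipped_cov = _sign * np.clip(np.abs(_cov), _cov_clip, np.inf)
_theta_dr = _prel_theta + (_res_y - _prel_theta * _res_t) * _res_z / _clipped_cov
# ---------------------------------------------------------------------------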
class _BaseDRIV(_OrthoLearner):
|
||||
|
||||
"""
|
||||
The _BaseDRIV algorithm for estimating CATE with IVs. It is the parent of the
|
||||
public intent-to-treat classes {IntentToTreatDRIV, LinearIntentToTreatDRIV} defined in this module
|
||||
|
||||
Parameters
|
||||
----------
|
||||
nuisance_models : dictionary of nuisance models, with {'name_of_model' : EstimatorObject, ...}
|
||||
|
||||
model_final : estimator
|
||||
model compatible with the sklearn regression API, used to fit the effect on X
|
||||
|
||||
featurizer : :term:`transformer`, optional, default None
|
||||
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
|
||||
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
|
||||
If featurizer=None, then CATE is trained on X.
|
||||
|
||||
fit_cate_intercept : bool, optional, default True
|
||||
Whether the linear CATE model should have a constant term.
|
||||
|
||||
cov_clip : float, optional, default 0.1
|
||||
clipping of the covariance between treatment and instrument for regions with low "overlap", to reduce variance
|
||||
|
||||
opt_reweighted : bool, optional, default False
|
||||
Whether to reweight the samples to minimize variance. If True then
|
||||
model_final.fit must accept sample_weight as a kw argument. If True then
|
||||
assumes the model_final is flexible enough to fit the true CATE model. Otherwise,
|
||||
the method will return a biased projection to the model_final space, biased
|
||||
to give more weight to parts of the feature space where the instrument is strong.
|
||||
|
||||
discrete_instrument: bool, optional, default False
|
||||
Whether the instrument values should be treated as categorical, rather than continuous, quantities
|
||||
|
||||
discrete_treatment: bool, optional, default False
|
||||
Whether the treatment values should be treated as categorical, rather than continuous, quantities
|
||||
|
||||
categories: 'auto' or list, default 'auto'
|
||||
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
|
||||
The first category will be treated as the control treatment.
|
||||
|
||||
cv: int, cross-validation generator or an iterable, optional, default 2
|
||||
Determines the cross-validation splitting strategy.
|
||||
Possible inputs for cv are:
|
||||
|
||||
- None, to use the default 3-fold cross-validation,
|
||||
- integer, to specify the number of folds.
|
||||
- :term:`CV splitter`
|
||||
- An iterable yielding (train, test) splits as arrays of indices.
|
||||
|
||||
For integer/None inputs, if the treatment is discrete
|
||||
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
|
||||
:class:`~sklearn.model_selection.KFold` is used
|
||||
(with a random shuffle in either case).
|
||||
|
||||
Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
|
||||
W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
|
||||
|
||||
mc_iters: int, optional (default=None)
|
||||
The number of times to rerun the first stage models to reduce the variance of the nuisances.
|
||||
|
||||
mc_agg: {'mean', 'median'}, optional (default='mean')
|
||||
How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of
|
||||
cross-fitting.
|
||||
|
||||
random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
|
||||
If int, random_state is the seed used by the random number generator;
|
||||
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
|
||||
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
|
||||
by :mod:`np.random<numpy.random>`.
|
||||
"""
|
||||
|
||||
def __init__(self, *,
|
||||
model_final,
|
||||
featurizer=None,
|
||||
fit_cate_intercept=True,
|
||||
cov_clip=0.1,
|
||||
opt_reweighted=False,
|
||||
discrete_instrument=False,
|
||||
discrete_treatment=False,
|
||||
categories='auto',
|
||||
cv=2,
|
||||
n_splits='raise',
|
||||
mc_iters=None,
|
||||
mc_agg='mean',
|
||||
random_state=None):
|
||||
self.model_final = clone(model_final, safe=False)
|
||||
self.featurizer = clone(featurizer, safe=False)
|
||||
self.fit_cate_intercept = fit_cate_intercept
|
||||
self.cov_clip = cov_clip
|
||||
self.opt_reweighted = opt_reweighted
|
||||
super().__init__(discrete_instrument=discrete_instrument,
|
||||
discrete_treatment=discrete_treatment,
|
||||
categories=categories,
|
||||
cv=cv,
|
||||
n_splits=n_splits,
|
||||
mc_iters=mc_iters,
|
||||
mc_agg=mc_agg,
|
||||
random_state=random_state)
|
||||
|
||||
def _gen_model_final(self):
|
||||
return clone(self.model_final, safe=False)
|
||||
|
||||
def _gen_ortho_learner_model_final(self):
|
||||
return _BaseDRIVModelFinal(self._gen_model_final(),
|
||||
clone(self.featurizer, safe=False),
|
||||
self.discrete_treatment,
|
||||
self.discrete_instrument,
|
||||
self.fit_cate_intercept,
|
||||
self.cov_clip,
|
||||
self.opt_reweighted)
|
||||
|
||||
@_deprecate_positional("X, W, and Z should be passed by keyword only. In a future release "
|
||||
"we will disallow passing X, W, and Z by position.", ['X', 'W', 'Z'])
|
||||
def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None,
|
||||
cache_values=False, inference=None):
|
||||
"""
|
||||
Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
Y: (n, d_y) matrix or vector of length n
|
||||
Outcomes for each sample
|
||||
T: (n, d_t) matrix or vector of length n
|
||||
Treatments for each sample
|
||||
Z: (n, d_z) matrix
|
||||
Instruments for each sample
|
||||
X: optional(n, d_x) matrix or None (Default=None)
|
||||
Features for each sample
|
||||
W: optional(n, d_w) matrix or None (Default=None)
|
||||
Controls for each sample
|
||||
sample_weight: optional(n,) vector or None (Default=None)
|
||||
Weights for each sample
|
||||
sample_var: optional(n,) vector or None (Default=None)
|
||||
Sample variance for each sample
|
||||
groups: (n,) vector, optional
|
||||
All rows corresponding to the same group will be kept together during splitting.
|
||||
If groups is not None, the `cv` argument passed to this class's initializer
|
||||
must support a 'groups' argument to its split method.
|
||||
cache_values: bool, default False
|
||||
Whether to cache inputs and first stage results, which will allow refitting a different final model
|
||||
inference: string, :class:`.Inference` instance, or None
|
||||
Method for performing inference. This estimator supports 'bootstrap'
|
||||
(or an instance of :class:`.BootstrapInference`).
|
||||
|
||||
Returns
|
||||
-------
|
||||
self: _BaseDRIV instance
|
||||
"""
|
||||
# Replacing fit from _OrthoLearner, to reorder arguments and improve the docstring
|
||||
return super().fit(Y, T, X=X, W=W, Z=Z,
|
||||
sample_weight=sample_weight, sample_var=sample_var, groups=groups,
|
||||
cache_values=cache_values, inference=inference)
|
||||
|
||||
def score(self, Y, T, Z, X=None, W=None, sample_weight=None):
|
||||
"""
|
||||
Score the fitted CATE model on a new data set. Generates nuisance parameters
|
||||
for the new data set based on the fitted nuisance models created at fit time.
|
||||
It uses the mean prediction of the models fitted by the different crossfit folds.
|
||||
Then calls the score function of the model_final and returns the calculated score.
|
||||
The model_final model must have a score method.
|
||||
|
||||
If model_final does not have a score method, then it raises an :exc:`.AttributeError`
|
||||
|
||||
Parameters
|
||||
----------
|
||||
Y: (n, d_y) matrix or vector of length n
|
||||
Outcomes for each sample
|
||||
T: (n, d_t) matrix or vector of length n
|
||||
Treatments for each sample
|
||||
Z: (n, d_z) matrix or None (Default=None)
|
||||
Instruments for each sample
|
||||
X: optional (n, d_x) matrix or None (Default=None)
|
||||
Features for each sample
|
||||
W: optional(n, d_w) matrix or None (Default=None)
|
||||
Controls for each sample
|
||||
sample_weight: optional(n,) vector or None (Default=None)
|
||||
Weights for each sample
|
||||
|
||||
Returns
|
||||
-------
|
||||
score : float or (array of float)
|
||||
The score of the final CATE model on the new data. Same type as the return
|
||||
type of the model_final.score method.
|
||||
"""
|
||||
# Replacing score from _OrthoLearner, to reorder arguments and improve the docstring
|
||||
return super().score(Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight)
|
||||
|
||||
@property
|
||||
def original_featurizer(self):
|
||||
return self.ortho_learner_model_final_._original_featurizer
|
||||
|
||||
@property
|
||||
def featurizer_(self):
|
||||
# NOTE This is used by the inference methods and has to be the overall featurizer; intended
|
||||
# for internal use by the library
|
||||
return self.ortho_learner_model_final_._featurizer
|
||||
|
||||
@property
|
||||
def model_final_(self):
|
||||
# NOTE This is used by the inference methods and is intended more for internal use by the library
|
||||
return self.ortho_learner_model_final_._model_final
|
||||
|
||||
def cate_feature_names(self, feature_names=None):
|
||||
"""
|
||||
Get the output feature names.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
feature_names: list of strings of length X.shape[1] or None
|
||||
The names of the input features. If None and X is a dataframe, it defaults to the column names
|
||||
from the dataframe.
|
||||
|
||||
Returns
|
||||
-------
|
||||
out_feature_names: list of strings or None
|
||||
The names of the output features :math:`\\phi(X)`, i.e. the features with respect to which the
|
||||
final constant marginal CATE model is linear. It is the names of the features that are associated
|
||||
with each entry of the :meth:`coef_` parameter. Not available when the featurizer is not None and
|
||||
does not have a method: `get_feature_names(feature_names)`. Otherwise None is returned.
|
||||
"""
|
||||
if feature_names is None:
|
||||
feature_names = self._input_names["feature_names"]
|
||||
if self.original_featurizer is None:
|
||||
return feature_names
|
||||
elif hasattr(self.original_featurizer, 'get_feature_names'):
|
||||
return self.original_featurizer.get_feature_names(feature_names)
|
||||
else:
|
||||
raise AttributeError("Featurizer does not have a method: get_feature_names!")
|
||||
|
||||
|
||||
class _IntentToTreatDRIVModelNuisance:
|
||||
"""
|
||||
Nuisance model that fits the two first-stage models and the preliminary effect model at fit time and, at predict time,
|
||||
returns the preliminary effect estimate, the treatment and outcome residuals, the recentered instrument and the compliance delta.
|
||||
"""
|
||||
|
||||
def __init__(self, model_Y_X, model_T_XZ, prel_model_effect):
|
||||
self._model_Y_X = clone(model_Y_X, safe=False)
|
||||
self._model_T_XZ = clone(model_T_XZ, safe=False)
|
||||
self._prel_model_effect = clone(prel_model_effect, safe=False)
|
||||
|
||||
def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):
|
||||
self._model_Y_X.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups)
|
||||
self._model_T_XZ.fit(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight, groups=groups)
|
||||
# we need to undo the one-hot encoding for calling effect,
|
||||
# since it expects raw values
|
||||
self._prel_model_effect.fit(Y, inverse_onehot(T), Z=inverse_onehot(Z), X=X, W=W,
|
||||
sample_weight=sample_weight, groups=groups)
|
||||
return self
|
||||
|
||||
def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None):
|
||||
if hasattr(self._model_Y_X, 'score'):
|
||||
Y_X_score = self._model_Y_X.score(X=X, W=W, Target=Y, sample_weight=sample_weight)
|
||||
else:
|
||||
Y_X_score = None
|
||||
if hasattr(self._model_T_XZ, 'score'):
|
||||
T_XZ_score = self._model_T_XZ.score(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight)
|
||||
else:
|
||||
T_XZ_score = None
|
||||
if hasattr(self._prel_model_effect, 'score'):
|
||||
# we need to undo the one-hot encoding for calling effect,
|
||||
# since it expects raw values
|
||||
effect_score = self._prel_model_effect.score(Y, inverse_onehot(T),
|
||||
Z=inverse_onehot(Z), X=X, W=W, sample_weight=sample_weight)
|
||||
else:
|
||||
effect_score = None
|
||||
|
||||
return Y_X_score, T_XZ_score, effect_score
|
||||
|
||||
def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None):
|
||||
Y_pred = self._model_Y_X.predict(X, W)
|
||||
T_pred_zero = self._model_T_XZ.predict(X, W, np.zeros(Z.shape))
|
||||
T_pred_one = self._model_T_XZ.predict(X, W, np.ones(Z.shape))
|
||||
delta = (T_pred_one - T_pred_zero) / 2
|
||||
T_pred_mean = (T_pred_one + T_pred_zero) / 2
|
||||
prel_theta = self._prel_model_effect.effect(X)
|
||||
if X is None: # In this case predict above returns a single row
|
||||
Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1))
|
||||
prel_theta = np.tile(prel_theta.reshape(1, -1), (T.shape[0], 1))
|
||||
Y_res = Y - Y_pred.reshape(Y.shape)
|
||||
T_res = T - T_pred_mean.reshape(T.shape)
|
||||
return prel_theta, T_res, Y_res, 2 * Z - 1, delta
|
||||
|
||||
|
||||
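# ---------------------------------------------------------------------------
# Illustrative sketch (editorial): the compliance-related quantities produced by
# the nuisance predict above for a binary instrument, using a plain logistic
# regression as a stand-in propensity model. Data and names are hypothetical.
import numpy as np
from sklearn.linear_model import LogisticRegression

_rng = np.random.RandomState(3)
_X = _rng.normal(size=(1000, 2))
_Z = _rng.binomial(1, 0.5, size=1000)
_T = _rng.binomial(1, 1 / (1 + np.exp(-(0.5 + 1.5 * _Z + _X[:, 0]))))

_prop = LogisticRegression().fit(np.column_stack([_X, _Z]), _T)
_p1 = _prop.predict_proba(np.column_stack([_X, np.ones_like(_Z)]))[:, 1]    # E[T | X, Z=1]
_p0 = _prop.predict_proba(np.column_stack([_X, np.zeros_like(_Z)]))[:, 1]   # E[T | X, Z=0]
_delta = (_p1 - _p0) / 2            # half the compliance gap, as in predict above
_z_res = 2 * _Z - 1                 # instrument recentered to {-1, +1}
# ---------------------------------------------------------------------------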
class _IntentToTreatDRIV(_BaseDRIV):
|
||||
"""
|
||||
Helper class for the DRIV algorithm for the intent-to-treat A/B test setting
|
||||
"""
|
||||
|
||||
def __init__(self, *,
|
||||
model_Y_X,
|
||||
model_T_XZ,
|
||||
prel_model_effect,
|
||||
model_final,
|
||||
featurizer=None,
|
||||
fit_cate_intercept=True,
|
||||
cov_clip=.1,
|
||||
cv=3,
|
||||
n_splits='raise',
|
||||
mc_iters=None,
|
||||
mc_agg='mean',
|
||||
opt_reweighted=False,
|
||||
categories='auto',
|
||||
random_state=None):
|
||||
"""
|
||||
"""
|
||||
self.model_Y_X = clone(model_Y_X, safe=False)
|
||||
self.model_T_XZ = clone(model_T_XZ, safe=False)
|
||||
self.prel_model_effect = clone(prel_model_effect, safe=False)
|
||||
# TODO: check that Y, T, Z do not have multiple columns
|
||||
super().__init__(model_final=model_final,
|
||||
featurizer=featurizer,
|
||||
fit_cate_intercept=fit_cate_intercept,
|
||||
cov_clip=cov_clip,
|
||||
cv=cv,
|
||||
n_splits=n_splits,
|
||||
mc_iters=mc_iters,
|
||||
mc_agg=mc_agg,
|
||||
discrete_instrument=True,
|
||||
discrete_treatment=True,
|
||||
categories=categories,
|
||||
opt_reweighted=opt_reweighted,
|
||||
random_state=random_state)
|
||||
|
||||
def _gen_prel_model_effect(self):
|
||||
return clone(self.prel_model_effect, safe=False)
|
||||
|
||||
def _gen_ortho_learner_model_nuisance(self):
|
||||
return _IntentToTreatDRIVModelNuisance(_FirstStageWrapper(clone(self.model_Y_X, safe=False),
|
||||
discrete_target=False),
|
||||
_FirstStageWrapper(clone(self.model_T_XZ, safe=False),
|
||||
discrete_target=True),
|
||||
self._gen_prel_model_effect())
|
||||
|
||||
|
||||
class _DummyCATE:
|
||||
"""
|
||||
A dummy cate effect model that always returns zero effect
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
return
|
||||
|
||||
def fit(self, y, T, *, Z, X, W=None, sample_weight=None, groups=None, **kwargs):
|
||||
return self
|
||||
|
||||
def effect(self, X):
|
||||
if X is None:
|
||||
return np.zeros(1)
|
||||
return np.zeros(X.shape[0])
|
||||
|
||||
|
||||
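# ---------------------------------------------------------------------------
# Illustrative sketch (editorial): why a zero-effect preliminary model is useful.
# With prel_theta = 0, the doubly-robust estimate in _BaseDRIVModelFinal reduces to
# the plain IV moment Y_res * Z_res / Cov(T, Z | X), which is what the preliminary
# DRIV stage built below relies on. Toy numbers only.
import numpy as np

_res_y = np.array([1.0, -0.5, 0.3])
_res_t = np.array([0.8, 0.1, -0.4])
_res_z = np.array([1.0, -1.0, 1.0])
_cov = np.array([0.4, 0.5, 0.3])
_prel_theta = np.zeros(3)                                  # what _DummyCATE.effect returns
_theta_dr = _prel_theta + (_res_y - _prel_theta * _res_t) * _res_z / _cov
# equals _res_y * _res_z / _cov elementwise
# ---------------------------------------------------------------------------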
class IntentToTreatDRIV(_IntentToTreatDRIV):
|
||||
"""
|
||||
Implements the DRIV algorithm for the intent-to-treat A/B test setting
|
||||
|
||||
Parameters
|
||||
----------
|
||||
model_Y_X : estimator
|
||||
model to estimate :math:`\\E[Y | X]`. Must support `fit` and `predict` methods.
|
||||
|
||||
model_T_XZ : estimator
|
||||
model to estimate :math:`\\E[T | X, Z]`. Must support `fit` and `predict_proba` methods.
|
||||
|
||||
flexible_model_effect : estimator
|
||||
a flexible model for a preliminary version of the CATE, must accept sample_weight at fit time.
|
||||
|
||||
model_final : estimator, optional, default None
|
||||
a final model for the CATE and projections. If None, then flexible_model_effect is also used as a final model
|
||||
|
||||
featurizer : :term:`transformer`, optional, default None
|
||||
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
|
||||
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
|
||||
If featurizer=None, then CATE is trained on X.
|
||||
|
||||
fit_cate_intercept : bool, optional, default True
|
||||
Whether the linear CATE model should have a constant term.
|
||||
|
||||
cov_clip : float, optional, default 0.1
|
||||
clipping of the covariance between treatment and instrument for regions with low "overlap", to reduce variance
|
||||
|
||||
cv: int, cross-validation generator or an iterable, optional, default 3
|
||||
Determines the cross-validation splitting strategy.
|
||||
Possible inputs for cv are:
|
||||
|
||||
- None, to use the default 3-fold cross-validation,
|
||||
- integer, to specify the number of folds.
|
||||
- :term:`CV splitter`
|
||||
- An iterable yielding (train, test) splits as arrays of indices.
|
||||
|
||||
For integer/None inputs, if the treatment is discrete
|
||||
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
|
||||
:class:`~sklearn.model_selection.KFold` is used
|
||||
(with a random shuffle in either case).
|
||||
|
||||
Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
|
||||
W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
|
||||
|
||||
mc_iters: int, optional (default=None)
|
||||
The number of times to rerun the first stage models to reduce the variance of the nuisances.
|
||||
|
||||
mc_agg: {'mean', 'median'}, optional (default='mean')
|
||||
How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of
|
||||
cross-fitting.
|
||||
|
||||
opt_reweighted : bool, optional, default False
|
||||
Whether to reweight the samples to minimize variance. If True then
|
||||
model_final.fit must accept sample_weight as a kw argument (WeightWrapper from
|
||||
utilities can be used for any linear model to enable sample_weights). If True then
|
||||
assumes the model_final is flexible enough to fit the true CATE model. Otherwise,
|
||||
the method will return a biased projection to the model_final space, biased
|
||||
to give more weight to parts of the feature space where the instrument is strong.
|
||||
|
||||
categories: 'auto' or list, default 'auto'
|
||||
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
|
||||
The first category will be treated as the control treatment.
|
||||
|
||||
random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
|
||||
If int, random_state is the seed used by the random number generator;
|
||||
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
|
||||
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
|
||||
by :mod:`np.random<numpy.random>`.
|
||||
"""
|
||||
|
||||
def __init__(self, *,
|
||||
model_Y_X,
|
||||
model_T_XZ,
|
||||
flexible_model_effect,
|
||||
model_final=None,
|
||||
featurizer=None,
|
||||
fit_cate_intercept=True,
|
||||
cov_clip=.1,
|
||||
cv=3,
|
||||
n_splits='raise',
|
||||
mc_iters=None,
|
||||
mc_agg='mean',
|
||||
opt_reweighted=False,
|
||||
categories='auto',
|
||||
random_state=None):
|
||||
self.flexible_model_effect = clone(flexible_model_effect, safe=False)
|
||||
super().__init__(model_Y_X=model_Y_X,
|
||||
model_T_XZ=model_T_XZ,
|
||||
prel_model_effect=None,
|
||||
model_final=model_final,
|
||||
featurizer=featurizer,
|
||||
fit_cate_intercept=fit_cate_intercept,
|
||||
cov_clip=cov_clip,
|
||||
cv=cv,
|
||||
n_splits=n_splits,
|
||||
mc_iters=mc_iters,
|
||||
mc_agg=mc_agg,
|
||||
opt_reweighted=opt_reweighted,
|
||||
categories=categories,
|
||||
random_state=random_state)
|
||||
|
||||
def _gen_model_final(self):
|
||||
if self.model_final is None:
|
||||
return clone(self.flexible_model_effect, safe=False)
|
||||
return clone(self.model_final, safe=False)
|
||||
|
||||
def _gen_prel_model_effect(self):
|
||||
return _IntentToTreatDRIV(model_Y_X=clone(self.model_Y_X, safe=False),
|
||||
model_T_XZ=clone(self.model_T_XZ, safe=False),
|
||||
prel_model_effect=_DummyCATE(),
|
||||
model_final=clone(self.flexible_model_effect, safe=False),
|
||||
cov_clip=1e-7,
|
||||
cv=1,
|
||||
opt_reweighted=True,
|
||||
random_state=self.random_state)
|
||||
|
||||
@property
|
||||
def models_Y_X(self):
|
||||
return [mdl._model_Y_X._model for mdl in super().models_nuisance_]
|
||||
|
||||
@property
|
||||
def models_T_XZ(self):
|
||||
return [mdl._model_T_XZ._model for mdl in super().models_nuisance_]
|
||||
|
||||
@property
|
||||
def nuisance_scores_Y_X(self):
|
||||
return self.nuisance_scores_[0]
|
||||
|
||||
@property
|
||||
def nuisance_scores_T_XZ(self):
|
||||
return self.nuisance_scores_[1]
|
||||
|
||||
@property
|
||||
def nuisance_scores_effect(self):
|
||||
return self.nuisance_scores_[2]
|
||||
|
||||
@property
|
||||
def prel_model_effect(self):
|
||||
return self._gen_prel_model_effect()
|
||||
|
||||
@prel_model_effect.setter
|
||||
def prel_model_effect(self, value):
|
||||
if value is not None:
|
||||
raise ValueError("Parameter `prel_model_effect` cannot be altered for this estimator.")
|
||||
|
||||
|
||||
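# ---------------------------------------------------------------------------
# Illustrative usage sketch (editorial): after fitting, the properties defined
# above expose the cross-fitted nuisance models and their out-of-sample scores,
# which helps diagnose a weak instrument or a poor propensity fit. The data
# generating process and learner choices below are hypothetical.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

_rng = np.random.RandomState(4)
_X = _rng.normal(size=(1000, 2))
_Z = _rng.binomial(1, 0.5, size=1000)
_T = _rng.binomial(1, np.clip(0.2 + 0.6 * _Z, 0.0, 1.0))
_Y = _T * (1 + _X[:, 0]) + _rng.normal(size=1000)

_est = IntentToTreatDRIV(model_Y_X=RandomForestRegressor(min_samples_leaf=20),
                         model_T_XZ=LogisticRegression(),
                         flexible_model_effect=RandomForestRegressor(min_samples_leaf=20))
_est.fit(_Y, _T, Z=_Z, X=_X)
_fold_propensity_models = _est.models_T_XZ          # one fitted model per crossfit fold
_fold_propensity_scores = _est.nuisance_scores_T_XZ
# ---------------------------------------------------------------------------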
class LinearIntentToTreatDRIV(StatsModelsCateEstimatorMixin, IntentToTreatDRIV):
|
||||
"""
|
||||
Implements the DRIV algorithm for the intent-to-treat A/B test setting
|
||||
|
||||
Parameters
|
||||
----------
|
||||
model_Y_X : estimator
|
||||
model to estimate :math:`\\E[Y | X]`. Must support `fit` and `predict` methods.
|
||||
|
||||
model_T_XZ : estimator
|
||||
model to estimate :math:`\\E[T | X, Z]`. Must support `fit` and `predict_proba` methods.
|
||||
|
||||
flexible_model_effect : estimator
|
||||
a flexible model for a preliminary version of the CATE, must accept sample_weight at fit time.
|
||||
|
||||
featurizer : :term:`transformer`, optional, default None
|
||||
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
|
||||
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
|
||||
If featurizer=None, then CATE is trained on X.
|
||||
|
||||
fit_cate_intercept : bool, optional, default True
|
||||
Whether the linear CATE model should have a constant term.
|
||||
|
||||
cov_clip : float, optional, default 0.1
|
||||
clipping of the covariance between treatment and instrument for regions with low "overlap", to reduce variance
|
||||
|
||||
cv: int, cross-validation generator or an iterable, optional, default 3
|
||||
Determines the cross-validation splitting strategy.
|
||||
Possible inputs for cv are:
|
||||
|
||||
- None, to use the default 3-fold cross-validation,
|
||||
- integer, to specify the number of folds.
|
||||
- :term:`CV splitter`
|
||||
- An iterable yielding (train, test) splits as arrays of indices.
|
||||
|
||||
For integer/None inputs, if the treatment is discrete
|
||||
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
|
||||
:class:`~sklearn.model_selection.KFold` is used
|
||||
(with a random shuffle in either case).
|
||||
|
||||
Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
|
||||
W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
|
||||
|
||||
mc_iters: int, optional (default=None)
|
||||
The number of times to rerun the first stage models to reduce the variance of the nuisances.
|
||||
|
||||
mc_agg: {'mean', 'median'}, optional (default='mean')
|
||||
How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of
|
||||
cross-fitting.
|
||||
|
||||
categories: 'auto' or list, default 'auto'
|
||||
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
|
||||
The first category will be treated as the control treatment.
|
||||
|
||||
random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
|
||||
If int, random_state is the seed used by the random number generator;
|
||||
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
|
||||
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
|
||||
by :mod:`np.random<numpy.random>`.
|
||||
"""
|
||||
|
||||
def __init__(self, *,
|
||||
model_Y_X,
|
||||
model_T_XZ,
|
||||
flexible_model_effect,
|
||||
featurizer=None,
|
||||
fit_cate_intercept=True,
|
||||
cov_clip=.1,
|
||||
cv=3,
|
||||
n_splits='raise',
|
||||
mc_iters=None,
|
||||
mc_agg='mean',
|
||||
categories='auto',
|
||||
random_state=None):
|
||||
super().__init__(model_Y_X=model_Y_X,
|
||||
model_T_XZ=model_T_XZ,
|
||||
flexible_model_effect=flexible_model_effect,
|
||||
featurizer=featurizer,
|
||||
fit_cate_intercept=fit_cate_intercept,
|
||||
model_final=None,
|
||||
cov_clip=cov_clip,
|
||||
cv=cv,
|
||||
n_splits=n_splits,
|
||||
mc_iters=mc_iters,
|
||||
mc_agg=mc_agg,
|
||||
opt_reweighted=False,
|
||||
categories=categories, random_state=random_state)
|
||||
|
||||
def _gen_model_final(self):
|
||||
return StatsModelsLinearRegression(fit_intercept=False)
|
||||
|
||||
# override only so that we can update the docstring to indicate support for `StatsModelsInference`
|
||||
@_deprecate_positional("X, W, and Z should be passed by keyword only. In a future release "
|
||||
"we will disallow passing X, W, and Z by position.", ['X', 'W', 'Z'])
|
||||
def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None,
|
||||
cache_values=False, inference='auto'):
|
||||
"""
|
||||
Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
Y: (n, d_y) matrix or vector of length n
|
||||
Outcomes for each sample
|
||||
T: (n, d_t) matrix or vector of length n
|
||||
Treatments for each sample
|
||||
Z: (n, d_z) matrix or vector of length n
|
||||
Instruments for each sample
|
||||
X: optional(n, d_x) matrix or None (Default=None)
|
||||
Features for each sample
|
||||
W: optional(n, d_w) matrix or None (Default=None)
|
||||
Controls for each sample
|
||||
sample_weight: optional(n,) vector or None (Default=None)
|
||||
Weights for each sample
|
||||
sample_var: optional(n,) vector or None (Default=None)
|
||||
Sample variance for each sample
|
||||
groups: (n,) vector, optional
|
||||
All rows corresponding to the same group will be kept together during splitting.
|
||||
If groups is not None, the `cv` argument passed to this class's initializer
|
||||
must support a 'groups' argument to its split method.
|
||||
cache_values: bool, default False
|
||||
Whether to cache inputs and first stage results, which will allow refitting a different final model
|
||||
inference: string,:class:`.Inference` instance, or None
|
||||
Method for performing inference. This estimator supports 'bootstrap'
|
||||
(or an instance of:class:`.BootstrapInference`) and 'statsmodels'
|
||||
(or an instance of :class:`.StatsModelsInference`).
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : instance
|
||||
"""
|
||||
return super().fit(Y, T, Z=Z, X=X, W=W,
|
||||
sample_weight=sample_weight, sample_var=sample_var, groups=groups,
|
||||
cache_values=cache_values, inference=inference)
|
||||
|
||||
def refit_final(self, *, inference='auto'):
|
||||
return super().refit_final(inference=inference)
|
||||
refit_final.__doc__ = _OrthoLearner.refit_final.__doc__
|
||||
|
||||
@property
|
||||
def bias_part_of_coef(self):
|
||||
return self.ortho_learner_model_final_._fit_cate_intercept
|
||||
|
||||
@property
|
||||
def fit_cate_intercept_(self):
|
||||
return self.ortho_learner_model_final_._fit_cate_intercept
|
||||
|
||||
@property
|
||||
def model_final(self):
|
||||
return self._gen_model_final()
|
||||
|
||||
@model_final.setter
|
||||
def model_final(self, value):
|
||||
if value is not None:
|
||||
raise ValueError("Parameter `model_final` cannot be altered for this estimator.")
|
||||
|
||||
@property
|
||||
def opt_reweighted(self):
|
||||
return False
|
||||
|
||||
@opt_reweighted.setter
|
||||
def opt_reweighted(self, value):
|
||||
if not (value is False):
|
||||
raise ValueError("Parameter `value` cannot be altered from `False` for this estimator.")
|
|
@ -0,0 +1,6 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from ._deepiv import DeepIV
|
||||
|
||||
__all__ = ["DeepIV"]
|
|
@ -0,0 +1,455 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""Deep IV estimator and related components."""
|
||||
|
||||
import numpy as np
|
||||
import keras
|
||||
from ..._cate_estimator import BaseCateEstimator
|
||||
from ...utilities import deprecated
|
||||
from keras import backend as K
|
||||
import keras.layers as L
|
||||
from keras.models import Model
|
||||
from econml.utilities import check_input_arrays, _deprecate_positional
|
||||
|
||||
# TODO: make sure to use random seeds wherever necessary
|
||||
# TODO: make sure that the public API consistently uses "T" instead of "P" for the treatment
|
||||
|
||||
# unfortunately with the Theano and Tensorflow backends,
|
||||
# the straightforward use of K.stop_gradient can cause an error
|
||||
# because the parameters of the intermediate layers are now disconnected from the loss;
|
||||
# therefore we add a pointless multiplication by 0 to the values in each of the variables in vs
|
||||
# so that those layers remain connected but with 0 gradient
|
||||
|
||||
|
||||
def _zero_grad(e, vs):
|
||||
if K.backend() == 'cntk':
|
||||
return K.stop_gradient(e)
|
||||
else:
|
||||
z = 0 * K.sum(K.concatenate([K.batch_flatten(v) for v in vs]))
|
||||
return K.stop_gradient(e) + z
|
||||
|
||||
|
||||
def mog_model(n_components, d_x, d_t):
|
||||
"""
|
||||
Create a mixture of Gaussians model with the specified number of components.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_components : int
|
||||
The number of components in the mixture model
|
||||
|
||||
d_x : int
|
||||
The number of dimensions in the layer used as input
|
||||
|
||||
d_t : int
|
||||
The number of dimensions in the output
|
||||
|
||||
Returns
|
||||
-------
|
||||
A Keras model that takes an input of dimension `d_t` and generates three outputs: pi, mu, and sigma
|
||||
|
||||
"""
|
||||
x = L.Input((d_x,))
|
||||
pi = L.Dense(n_components, activation='softmax')(x)
|
||||
mu = L.Reshape((n_components, d_t))(L.Dense(n_components * d_t)(x))
|
||||
log_sig = L.Dense(n_components)(x)
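# the network outputs log(sigma); exponentiating keeps each component's scale strictly positive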
|
||||
sig = L.Lambda(K.exp)(log_sig)
|
||||
return Model([x], [pi, mu, sig])
|
||||
|
||||
|
||||
def mog_loss_model(n_components, d_t):
|
||||
"""
|
||||
Create a Keras model that computes the loss of a mixture of Gaussians model on data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_components : int
|
||||
The number of components in the mixture model
|
||||
|
||||
d_t : int
|
||||
The number of dimensions in the output
|
||||
|
||||
Returns
|
||||
-------
|
||||
A Keras model that takes as inputs pi, mu, sigma, and t and generates a single output containing the loss.
|
||||
|
||||
"""
|
||||
pi = L.Input((n_components,))
|
||||
mu = L.Input((n_components, d_t))
|
||||
sig = L.Input((n_components,))
|
||||
t = L.Input((d_t,))
|
||||
|
||||
# || t - mu_i || ^2
|
||||
d2 = L.Lambda(lambda d: K.sum(K.square(d), axis=-1),
|
||||
output_shape=(n_components,))(
|
||||
L.Subtract()([L.RepeatVector(n_components)(t), mu])
|
||||
)
|
||||
|
||||
# LL = C - log(sum(pi_i/sig^d * exp(-d2/(2*sig^2))))
|
||||
# Use logsumexp for numeric stability:
|
||||
# LL = C - log(sum(exp(-d2/(2*sig^2) + log(pi_i/sig^d))))
|
||||
# TODO: does the numeric stability actually make any difference?
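# (without it, exp(-d2/(2*sig^2)) can underflow to zero when d2 is large, turning the log into -inf;
#  logsumexp factors out the largest exponent first so the result stays finite)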
|
||||
def make_logloss(d2, sig, pi):
|
||||
return -K.logsumexp(-d2 / (2 * K.square(sig)) + K.log(pi / K.pow(sig, d_t)), axis=-1)
|
||||
|
||||
ll = L.Lambda(lambda dsp: make_logloss(*dsp), output_shape=(1,))([d2, sig, pi])
|
||||
|
||||
m = Model([pi, mu, sig, t], [ll])
|
||||
return m
|
||||
|
||||
|
||||
def mog_sample_model(n_components, d_t):
|
||||
"""
|
||||
Create a model that generates samples from a mixture of Gaussians.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_components : int
|
||||
The number of components in the mixture model
|
||||
|
||||
d_t : int
|
||||
The number of dimensions in the output
|
||||
|
||||
Returns
|
||||
-------
|
||||
A Keras model that takes as inputs pi, mu, and sigma, and generates a single output containing a sample.
|
||||
|
||||
"""
|
||||
pi = L.Input((n_components,))
|
||||
mu = L.Input((n_components, d_t))
|
||||
sig = L.Input((n_components,))
|
||||
|
||||
# CNTK backend can't randomize across batches and doesn't implement cumsum (at least as of June 2018,
|
||||
# see Known Issues on https://docs.microsoft.com/en-us/cognitive-toolkit/Using-CNTK-with-Keras)
|
||||
def sample(pi, mu, sig):
|
||||
batch_size = K.shape(pi)[0]
|
||||
if K.backend() == 'cntk':
|
||||
# generate cumulative sum via matrix multiplication
|
||||
cumsum = K.dot(pi, K.constant(np.triu(np.ones((n_components, n_components)))))
|
||||
else:
|
||||
cumsum = K.cumsum(pi, 1)
|
||||
cumsum_shift = K.concatenate([K.zeros_like(cumsum[:, 0:1]), cumsum])[:, :-1]
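# component i is the one with cumsum_shift[:, i] <= u < cumsum[:, i], i.e. the probability bin
# of the categorical distribution defined by pi that the uniform draw u falls into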
|
||||
if K.backend() == 'cntk':
|
||||
import cntk as C
|
||||
# Generate standard uniform values in shape (batch_size,1)
|
||||
# (since we can't use the dynamic batch_size with random.uniform in CNTK,
|
||||
# we use uniform_like instead with an input of an appropriate shape)
|
||||
rndSmp = C.random.uniform_like(pi[:, 0:1])
|
||||
else:
|
||||
rndSmp = K.random_uniform((batch_size, 1))
|
||||
cmp1 = K.less_equal(cumsum_shift, rndSmp)
|
||||
cmp2 = K.less(rndSmp, cumsum)
|
||||
|
||||
# convert to floats and multiply to perform equivalent of logical AND
|
||||
rndIndex = K.cast(cmp1, K.floatx()) * K.cast(cmp2, K.floatx())
|
||||
|
||||
if K.backend() == 'cntk':
|
||||
# Generate standard normal values in shape (batch_size,1,d_t)
|
||||
# (since we can't use the dynamic batch_size with random.normal in CNTK,
|
||||
# we use normal_like instead with an input of an appropriate shape)
|
||||
rndNorms = C.random.normal_like(mu[:, 0:1, :]) # K.random_normal((1,d_t))
|
||||
else:
|
||||
rndNorms = K.random_normal((batch_size, 1, d_t))
|
||||
|
||||
rndVec = mu + K.expand_dims(sig) * rndNorms
|
||||
|
||||
# exactly one entry should be nonzero for each b,d combination; use sum to select it
|
||||
return K.sum(K.expand_dims(rndIndex) * rndVec, 1)
|
||||
|
||||
# prevent gradient from passing through sampling
|
||||
samp = L.Lambda(lambda pms: _zero_grad(sample(*pms), pms), output_shape=(d_t,))
|
||||
samp.trainable = False
|
||||
|
||||
return Model([pi, mu, sig], samp([pi, mu, sig]))
|
||||
|
||||
|
||||
# three options: biased or upper-bound loss require a single number of samples;
|
||||
# unbiased can take different numbers for the network and its gradient
|
||||
def response_loss_model(h, p, d_z, d_x, d_y, samples=1, use_upper_bound=False, gradient_samples=0):
|
||||
"""
|
||||
Create a Keras model that computes the loss of a response model on data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
h : (tensor, tensor) -> Layer
|
||||
Method for building a model of y given p and x
|
||||
|
||||
p : (tensor, tensor) -> Layer
|
||||
Method for building a model of p given z and x
|
||||
|
||||
d_z : int
|
||||
The number of dimensions in z
|
||||
|
||||
d_x : int
|
||||
The number of dimensions in x
|
||||
|
||||
d_y : int
|
||||
The number of dimensions in y
|
||||
|
||||
samples: int
|
||||
The number of samples to use
|
||||
|
||||
use_upper_bound : bool
|
||||
Whether to use an upper bound to the true loss
|
||||
(equivalent to adding a regularization penalty on the variance of h)
|
||||
|
||||
gradient_samples : int
|
||||
The number of separate additional samples to use when calculating the gradient.
|
||||
This can only be nonzero if use_upper_bound is False, in which case the gradient of
|
||||
the returned loss will be an unbiased estimate of the gradient of the true loss.
|
||||
|
||||
Returns
|
||||
-------
|
||||
A Keras model that takes as inputs z, x, and y and generates a single output containing the loss.
|
||||
|
||||
"""
|
||||
assert not(use_upper_bound and gradient_samples)
|
||||
|
||||
# sample: (() -> Layer, int) -> Layer
|
||||
def sample(f, n):
|
||||
assert n > 0
|
||||
if n == 1:
|
||||
return f()
|
||||
else:
|
||||
return L.average([f() for _ in range(n)])
|
||||
z, x, y = [L.Input((d,)) for d in [d_z, d_x, d_y]]
|
||||
if gradient_samples:
|
||||
# we want to separately sample the gradient; we use stop_gradient to treat the sampled model as constant
|
||||
# the overall computation ensures that we have an interpretable loss (y-h̅(p,x))²,
|
||||
# but also that the gradient is -2(y-h̅(p,x))∇h̅(p,x) with *different* samples used for each average
|
||||
diff = L.subtract([y, sample(lambda: h(p(z, x), x), samples)])
|
||||
grad = sample(lambda: h(p(z, x), x), gradient_samples)
|
||||
|
||||
def make_expr(grad, diff):
|
||||
return K.stop_gradient(diff) * (K.stop_gradient(diff + 2 * grad) - 2 * grad)
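# forward pass: diff * ((diff + 2*grad) - 2*grad) = diff**2, so the reported loss is the
# interpretable (y - h̅)²; only the trailing -2*grad term carries gradient, so backprop sees
# -2 * diff * ∇grad, with independent samples used for diff and grad (an unbiased gradient estimate)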
|
||||
expr = L.Lambda(lambda args: make_expr(*args))([grad, diff])
|
||||
elif use_upper_bound:
|
||||
expr = sample(lambda: L.Lambda(K.square)(L.subtract([y, h(p(z, x), x)])), samples)
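# averaging the squared residuals over samples (instead of squaring the averaged prediction)
# equals the squared residual of the average plus the sample variance of h, hence the
# "upper bound" loss that implicitly penalizes the variance of h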
|
||||
else:
|
||||
expr = L.Lambda(K.square)(L.subtract([y, sample(lambda: h(p(z, x), x), samples)]))
|
||||
return Model([z, x, y], [expr])
|
||||
|
||||
|
||||
class DeepIV(BaseCateEstimator):
|
||||
"""
|
||||
The Deep IV Estimator (see http://proceedings.mlr.press/v70/hartford17a/hartford17a.pdf).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_components : int
|
||||
Number of components in the mixture density network
|
||||
|
||||
m : (tensor, tensor) -> Layer
|
||||
Method for building a Keras model that featurizes the z and x inputs
|
||||
|
||||
h : (tensor, tensor) -> Layer
|
||||
Method for building a model of y given t and x
|
||||
|
||||
n_samples : int
|
||||
The number of samples to use
|
||||
|
||||
use_upper_bound_loss : bool, optional
|
||||
Whether to use an upper bound to the true loss
|
||||
(equivalent to adding a regularization penalty on the variance of h).
|
||||
Defaults to False.
|
||||
|
||||
n_gradient_samples : int, optional
|
||||
The number of separate additional samples to use when calculating the gradient.
|
||||
This can only be nonzero if use_upper_bound_loss is False, in which case the gradient of
|
||||
the returned loss will be an unbiased estimate of the gradient of the true loss.
|
||||
Defaults to 0.
|
||||
|
||||
optimizer : string, optional
|
||||
The optimizer to use. Defaults to "adam"
|
||||
|
||||
first_stage_options : dictionary, optional
|
||||
The keyword arguments to pass to Keras's `fit` method when training the first stage model.
|
||||
Defaults to `{"epochs": 100}`.
|
||||
|
||||
second_stage_options : dictionary, optional
|
||||
The keyword arguments to pass to Keras's `fit` method when training the second stage model.
|
||||
Defaults to `{"epochs": 100}`.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, *,
|
||||
n_components,
|
||||
m,
|
||||
h,
|
||||
n_samples, use_upper_bound_loss=False, n_gradient_samples=0,
|
||||
optimizer='adam',
|
||||
first_stage_options={"epochs": 100},
|
||||
second_stage_options={"epochs": 100}):
|
||||
self._n_components = n_components
|
||||
self._m = m
|
||||
self._h = h
|
||||
self._n_samples = n_samples
|
||||
self._use_upper_bound_loss = use_upper_bound_loss
|
||||
self._n_gradient_samples = n_gradient_samples
|
||||
self._optimizer = optimizer
|
||||
self._first_stage_options = first_stage_options
|
||||
self._second_stage_options = second_stage_options
|
||||
super().__init__()
|
||||
|
||||
@_deprecate_positional("X and Z should be passed by keyword only. In a future release "
|
||||
"we will disallow passing X and Z by position.", ['X', 'Z'])
|
||||
@BaseCateEstimator._wrap_fit
|
||||
def fit(self, Y, T, X, Z, *, inference=None):
|
||||
"""Estimate the counterfactual model from data.
|
||||
|
||||
That is, estimate functions τ(·, ·, ·), ∂τ(·, ·).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
Y: (n × d_y) matrix or vector of length n
|
||||
Outcomes for each sample
|
||||
T: (n × dₜ) matrix or vector of length n
|
||||
Treatments for each sample
|
||||
X: (n × dₓ) matrix
|
||||
Features for each sample
|
||||
Z: (n × d_z) matrix
|
||||
Instruments for each sample
|
||||
inference: string, :class:`.Inference` instance, or None
|
||||
Method for performing inference. This estimator supports 'bootstrap'
|
||||
(or an instance of :class:`.BootstrapInference`)
|
||||
|
||||
Returns
|
||||
-------
|
||||
self
|
||||
|
||||
"""
|
||||
Y, T, X, Z = check_input_arrays(Y, T, X, Z)
|
||||
assert 1 <= np.ndim(X) <= 2
|
||||
assert 1 <= np.ndim(Z) <= 2
|
||||
assert 1 <= np.ndim(T) <= 2
|
||||
assert 1 <= np.ndim(Y) <= 2
|
||||
assert np.shape(X)[0] == np.shape(Y)[0] == np.shape(T)[0] == np.shape(Z)[0]
|
||||
|
||||
# in case vectors were passed for Y or T, keep track of trailing dims for reshaping effect output
|
||||
|
||||
d_x, d_y, d_z, d_t = [np.shape(a)[1] if np.ndim(a) > 1 else 1 for a in [X, Y, Z, T]]
|
||||
x_in, y_in, z_in, t_in = [L.Input((d,)) for d in [d_x, d_y, d_z, d_t]]
|
||||
n_components = self._n_components
|
||||
|
||||
treatment_network = self._m(z_in, x_in)
|
||||
|
||||
# the dimensionality of the output of the network
|
||||
# TODO: is there a more robust way to do this?
|
||||
d_n = K.int_shape(treatment_network)[-1]
|
||||
|
||||
pi, mu, sig = mog_model(n_components, d_n, d_t)([treatment_network])
|
||||
|
||||
ll = mog_loss_model(n_components, d_t)([pi, mu, sig, t_in])
|
||||
|
||||
model = Model([z_in, x_in, t_in], [ll])
|
||||
model.add_loss(L.Lambda(K.mean)(ll))
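# `ll` is the per-sample negative log-likelihood; its batch mean is attached via add_loss,
# so compile() below needs no separate loss argument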
|
||||
model.compile(self._optimizer)
|
||||
# TODO: do we need to give the user more control over other arguments to fit?
|
||||
model.fit([Z, X, T], [], **self._first_stage_options)
|
||||
|
||||
lm = response_loss_model(lambda t, x: self._h(t, x),
|
||||
lambda z, x: Model([z_in, x_in],
|
||||
# subtle point: we need to build a new model each time,
|
||||
# because each model encapsulates its randomness
|
||||
[mog_sample_model(n_components, d_t)([pi, mu, sig])])([z, x]),
|
||||
d_z, d_x, d_y,
|
||||
self._n_samples, self._use_upper_bound_loss, self._n_gradient_samples)
|
||||
|
||||
rl = lm([z_in, x_in, y_in])
|
||||
response_model = Model([z_in, x_in, y_in], [rl])
|
||||
response_model.add_loss(L.Lambda(K.mean)(rl))
|
||||
response_model.compile(self._optimizer)
|
||||
# TODO: do we need to give the user more control over other arguments to fit?
|
||||
response_model.fit([Z, X, Y], [], **self._second_stage_options)
|
||||
|
||||
self._effect_model = Model([t_in, x_in], [self._h(t_in, x_in)])
|
||||
|
||||
# TODO: it seems like we need to sum over the batch because we can only apply gradient to a scalar,
|
||||
# not a general tensor (because of how backprop works in every framework)
|
||||
# (alternatively, we could iterate through the batch in addition to iterating through the output,
|
||||
# but this seems annoying...)
|
||||
# Therefore, it's important that we use a batch size of 1 when we call predict with this model
|
||||
def calc_grad(t, x):
|
||||
h = self._h(t, x)
|
||||
all_grads = K.concatenate([g
|
||||
for i in range(d_y)
|
||||
for g in K.gradients(K.sum(h[:, i]), [t])])
|
||||
return K.reshape(all_grads, (-1, d_y, d_t))
|
||||
|
||||
self._marginal_effect_model = Model([t_in, x_in], L.Lambda(lambda tx: calc_grad(*tx))([t_in, x_in]))
|
||||
|
||||
def effect(self, X=None, T0=0, T1=1):
|
||||
"""
|
||||
Calculate the heterogeneous treatment effect τ(·,·,·).
|
||||
|
||||
The effect is calculated between the two treatment points
|
||||
conditional on a vector of features on a set of m test samples {T0ᵢ, T1ᵢ, Xᵢ}.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
T0: (m × dₜ) matrix
|
||||
Base treatments for each sample
|
||||
T1: (m × dₜ) matrix
|
||||
Target treatments for each sample
|
||||
X: optional (m × dₓ) matrix
|
||||
Features for each sample
|
||||
|
||||
Returns
|
||||
-------
|
||||
τ: (m × d_y) matrix
|
||||
Heterogeneous treatment effects on each outcome for each sample
|
||||
Note that when Y is a vector rather than a 2-dimensional array, the corresponding
|
||||
singleton dimension will be collapsed (so this method will return a vector)
|
||||
"""
|
||||
X, T0, T1 = check_input_arrays(X, T0, T1)
|
||||
if np.ndim(T0) == 0:
|
||||
T0 = np.repeat(T0, 1 if X is None else np.shape(X)[0])
|
||||
if np.ndim(T1) == 0:
|
||||
T1 = np.repeat(T1, 1 if X is None else np.shape(X)[0])
|
||||
if X is None:
|
||||
X = np.empty((np.shape(T0)[0], 0))
|
||||
return (self._effect_model.predict([T1, X]) - self._effect_model.predict([T0, X])).reshape((-1,) + self._d_y)
|
||||
|
||||
def marginal_effect(self, T, X=None):
|
||||
"""
|
||||
Calculate the marginal effect ∂τ(·, ·) around a base treatment point conditional on features.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
T: (m × dₜ) matrix
|
||||
Base treatments for each sample
|
||||
X: optional(m × dₓ) matrix
|
||||
Features for each sample
|
||||
|
||||
Returns
|
||||
-------
|
||||
grad_tau: (m × d_y × dₜ) array
|
||||
Heterogeneous marginal effects on each outcome for each sample
|
||||
Note that when Y or T is a vector rather than a 2-dimensional array,
|
||||
the corresponding singleton dimensions in the output will be collapsed
|
||||
(e.g. if both are vectors, then the output of this method will also be a vector)
|
||||
"""
|
||||
T, X = check_input_arrays(T, X)
|
||||
# TODO: any way to get this to work on batches of arbitrary size?
|
||||
return self._marginal_effect_model.predict([T, X], batch_size=1).reshape((-1,) + self._d_y + self._d_t)
|
||||
|
||||
def predict(self, T, X):
|
||||
"""Predict outcomes given treatment assignments and features.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
T: (m × dₜ) matrix
|
||||
Base treatments for each sample
|
||||
X: (m × dₓ) matrix
|
||||
Features for each sample
|
||||
|
||||
Returns
|
||||
-------
|
||||
Y: (m × d_y) matrix
|
||||
Outcomes for each sample
|
||||
Note that when Y is a vector rather than a 2-dimensional array, the corresponding
|
||||
singleton dimension will be collapsed (so this method will return a vector)
|
||||
"""
|
||||
T, X = check_input_arrays(T, X)
|
||||
return self._effect_model.predict([T, X]).reshape((-1,) + self._d_y)
|
|
@ -0,0 +1,8 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from ._tsls import HermiteFeatures, DPolynomialFeatures, SieveTSLS
|
||||
|
||||
__all__ = ["HermiteFeatures",
|
||||
"DPolynomialFeatures",
|
||||
"SieveTSLS"]
|
|
@ -0,0 +1,362 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""Provides a non-parametric two-stage least squares instrumental variable estimator."""
|
||||
|
||||
import numpy as np
|
||||
from copy import deepcopy
|
||||
from sklearn import clone
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from ...utilities import (shape, transpose, reshape, cross_product, ndim, size,
|
||||
_deprecate_positional, check_input_arrays)
|
||||
from ..._cate_estimator import BaseCateEstimator, LinearCateEstimator
|
||||
from numpy.polynomial.hermite_e import hermeval
|
||||
from sklearn.base import TransformerMixin
|
||||
from sklearn.preprocessing import PolynomialFeatures
|
||||
from itertools import product
|
||||
|
||||
|
||||
class HermiteFeatures(TransformerMixin):
|
||||
"""
|
||||
Featurizer that returns(unscaled) Hermite function evaluations.
|
||||
|
||||
The evaluated functions are of degrees 0..`degree`, differentiated `shift` times.
|
||||
|
||||
If the input has shape(n, x) and `joint` is False, the output will have shape(n, (`degree`+ 1)×x) if `shift` is 0.
|
||||
If the input has shape(n, x) and `joint` is True, the output will have shape(n, (`degree`+ 1) ^ x) if `shift` is 0.
|
||||
In either case, if `shift` is nonzero there will be `shift` additional dimensions of size x
|
||||
between the first and last.
|
||||
"""
|
||||
|
||||
def __init__(self, degree, shift=0, joint=False):
|
||||
self._degree = degree
|
||||
self._shift = shift
|
||||
self._joint = joint
|
||||
|
||||
def _column_feats(self, X, shift):
|
||||
"""
|
||||
Apply Hermite function evaluations of degrees 0..`degree` differentiated `shift` times.
|
||||
|
||||
When applied to the column `X` of shape (n,), the resulting array has shape (n, degree + 1).
|
||||
"""
|
||||
assert ndim(X) == 1
|
||||
# this will have dimension (d,) + shape(X)
|
||||
coeffs = np.identity(self._degree + shift + 1)[:, shift:]
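# the s-th derivative of He_k(x)·exp(-x²/2) is (-1)^s·He_{k+s}(x)·exp(-x²/2), so column j of
# `coeffs` selects He_{shift+j}, and a single hermeval call yields all differentiated degrees 0..degree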
|
||||
feats = ((-1) ** shift) * hermeval(X, coeffs) * np.exp(-X * X / 2)
|
||||
# send the first dimension to the end
|
||||
return transpose(feats)
|
||||
|
||||
def fit(self, X):
|
||||
"""Fits the data(a NOP for this class) and returns self."""
|
||||
return self
|
||||
|
||||
def transform(self, X):
|
||||
"""
|
||||
Transform the data by applying the appropriate Hermite functions.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X: array_like
|
||||
2-dimensional array of input features
|
||||
|
||||
Returns
|
||||
-------
|
||||
The transformed data
|
||||
"""
|
||||
assert ndim(X) == 2
|
||||
n = shape(X)[0]
|
||||
ncols = shape(X)[1]
|
||||
columns = []
|
||||
for indices in product(*[range(ncols) for i in range(self._shift)]):
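# each `indices` tuple assigns, for every one of the `shift` derivative dimensions, the input
# column that the corresponding partial derivative is taken with respect to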
|
||||
if self._joint:
|
||||
columns.append(cross_product(*[self._column_feats(X[:, i], indices.count(i))
|
||||
for i in range(shape(X)[1])]))
|
||||
else:
|
||||
indices = set(indices)
|
||||
if self._shift == 0: # return features for all columns:
|
||||
columns.append(np.hstack([self._column_feats(X[:, i], self._shift) for i in range(shape(X)[1])]))
|
||||
# columns are featurized independently; partial derivatives are only non-zero
|
||||
# when taken with respect to the same column each time
|
||||
elif len(indices) == 1:
|
||||
index = list(indices)[0]
|
||||
feats = self._column_feats(X[:, index], self._shift)
|
||||
columns.append(np.hstack([feats if i == index else np.zeros(shape(feats))
|
||||
for i in range(shape(X)[1])]))
|
||||
else:
|
||||
columns.append(np.zeros((n, (self._degree + 1) * ncols)))
|
||||
return reshape(np.hstack(columns), (n,) + (ncols,) * self._shift + (-1,))
|
||||
|
||||
|
||||
class DPolynomialFeatures(TransformerMixin):
|
||||
"""
|
||||
Featurizer that returns the derivatives of :class:`~sklearn.preprocessing.PolynomialFeatures` features in
|
||||
a way that's compatible with the expectations of :class:`.NonparametricTwoStageLeastSquares`'s
|
||||
`dt_featurizer` parameter.
|
||||
|
||||
If the input has shape `(n, x)` and
|
||||
:meth:`PolynomialFeatures.transform<sklearn.preprocessing.PolynomialFeatures.transform>` returns an output
|
||||
of shape `(n, f)`, then :meth:`.transform` will return an array of shape `(n, x, f)`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
degree: integer, default = 2
|
||||
The degree of the polynomial features.
|
||||
|
||||
interaction_only: boolean, default = False
|
||||
If true, only derivatives of interaction features are produced: features that are products of at most degree
|
||||
distinct input features (so not `x[1] ** 2`, `x[0] * x[2] ** 3`, etc.).
|
||||
|
||||
include_bias: boolean, default = True
|
||||
If True (default), then include the derivative of a bias column, the feature in which all polynomial powers
|
||||
are zero.
|
||||
"""
|
||||
|
||||
def __init__(self, degree=2, interaction_only=False, include_bias=True):
|
||||
self.F = PolynomialFeatures(degree=degree, interaction_only=interaction_only, include_bias=include_bias)
|
||||
|
||||
def fit(self, X, y=None):
|
||||
"""
|
||||
Compute number of output features.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape (n_samples, n_features)
|
||||
The data.
|
||||
y : array, optional
|
||||
Not used
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : instance
|
||||
"""
|
||||
return self
|
||||
|
||||
def transform(self, X):
|
||||
"""
|
||||
Transform data to derivatives of polynomial features
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X: array-like, shape (n_samples, n_features)
|
||||
The data to transform, row by row.
|
||||
|
||||
Returns
|
||||
-------
|
||||
XP: array-like, shape (n_samples, n_features, n_output_features)
|
||||
The matrix of features, where `n_output_features` is the number of features that
|
||||
would be returned from :class:`~sklearn.preprocessing.PolynomialFeatures`.
|
||||
"""
|
||||
self.F.fit(X)
|
||||
powers = self.F.powers_
|
||||
result = np.zeros(X.shape + (self.F.n_output_features_,))
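# result[r, i, k] will hold ∂(feature_k)/∂(x_i) at row r: for feature_k = ∏_m x_m^p[k, m],
# that derivative is p[k, i] * x_i^(p[k, i] - 1) * ∏_{m≠i} x_m^p[k, m]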
|
||||
for i in range(X.shape[1]):
|
||||
p = powers.copy()
|
||||
c = powers[:, i]
|
||||
p[:, i] -= 1
|
||||
M = np.float_power(X[:, np.newaxis, :], p[np.newaxis, :, :])
|
||||
result[:, i, :] = c[np.newaxis, :] * np.prod(M, axis=-1)
|
||||
return result
|
||||
|
||||
|
||||
def _add_ones(arr):
|
||||
"""Add a column of ones to the front of an array."""
|
||||
return np.hstack([np.ones((shape(arr)[0], 1)), arr])
|
||||
|
||||
|
||||
def _add_zeros(arr):
|
||||
"""Add a column of zeros to the front of an array."""
|
||||
return np.hstack([np.zeros((shape(arr)[0], 1)), arr])
|
||||
|
||||
|
||||
class SieveTSLS(BaseCateEstimator):
|
||||
"""
|
||||
Non-parametric instrumental variables estimator.
|
||||
|
||||
Supports the use of arbitrary featurizers for the features, treatments, and instruments.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
t_featurizer: transformer
|
||||
Featurizer used to transform the treatments
|
||||
|
||||
x_featurizer: transformer
|
||||
Featurizer used to transform the raw features
|
||||
|
||||
z_featurizer: transformer
|
||||
Featurizer used to transform the instruments
|
||||
|
||||
dt_featurizer: transformer
|
||||
Featurizer used to transform the treatments for the computation of the marginal effect.
|
||||
This should produce a 3-dimensional array, containing the per-treatment derivative of
|
||||
each transformed treatment. That is, given a treatment array of shape (n, dₜ),
the output should have shape (n, dₜ, fₜ), where fₜ is the number of columns produced by `t_featurizer`.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, *,
|
||||
t_featurizer,
|
||||
x_featurizer,
|
||||
z_featurizer,
|
||||
dt_featurizer):
|
||||
self._t_featurizer = clone(t_featurizer, safe=False)
|
||||
self._x_featurizer = clone(x_featurizer, safe=False)
|
||||
self._z_featurizer = clone(z_featurizer, safe=False)
|
||||
self._dt_featurizer = clone(dt_featurizer, safe=False)
|
||||
# don't fit intercept; manually add column of ones to the data instead;
|
||||
# this allows us to ignore the intercept when computing marginal effects
|
||||
self._model_T = LinearRegression(fit_intercept=False)
|
||||
self._model_Y = LinearRegression(fit_intercept=False)
|
||||
super().__init__()
|
||||
|
||||
@_deprecate_positional("X, W, and Z should be passed by keyword only. In a future release "
|
||||
"we will disallow passing X, W, and Z by position.", ['X', 'W', 'Z'])
|
||||
@BaseCateEstimator._wrap_fit
|
||||
def fit(self, Y, T, X, W, Z, *, inference=None):
|
||||
"""
|
||||
Estimate the counterfactual model from data, i.e. estimates functions τ(·, ·, ·), ∂τ(·, ·).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
Y: (n × d_y) matrix
|
||||
Outcomes for each sample
|
||||
T: (n × dₜ) matrix
|
||||
Treatments for each sample
|
||||
X: optional(n × dₓ) matrix
|
||||
Features for each sample
|
||||
W: optional(n × d_w) matrix
|
||||
Controls for each sample
|
||||
Z: optional(n × d_z) matrix
|
||||
Instruments for each sample
|
||||
inference: string, :class:`.Inference` instance, or None
|
||||
Method for performing inference. This estimator supports 'bootstrap'
|
||||
(or an instance of :class:`.BootstrapInference`)
|
||||
|
||||
Returns
|
||||
-------
|
||||
self
|
||||
|
||||
"""
|
||||
Y, T, X, W, Z = check_input_arrays(Y, T, X, W, Z)
|
||||
if X is None:
|
||||
X = np.empty((shape(Y)[0], 0))
|
||||
if W is None:
|
||||
W = np.empty((shape(Y)[0], 0))
|
||||
assert shape(Y)[0] == shape(T)[0] == shape(X)[0] == shape(W)[0] == shape(Z)[0]
|
||||
|
||||
# make T 2D if it was a vector
|
||||
if ndim(T) == 1:
|
||||
T = reshape(T, (-1, 1))
|
||||
|
||||
# store number of columns of W so that we can create correctly shaped zero array in effect and marginal effect
|
||||
self._d_w = shape(W)[1]
|
||||
|
||||
# two stage approximation
|
||||
# first, get basis expansions of T, X, and Z
|
||||
ft_X = self._x_featurizer.fit_transform(X)
|
||||
ft_Z = self._z_featurizer.fit_transform(Z)
|
||||
ft_T = self._t_featurizer.fit_transform(T)
|
||||
# TODO: is it right that the effective number of instruments is the
|
||||
# product of ft_X and ft_Z, not just ft_Z?
|
||||
assert shape(ft_T)[1] <= shape(ft_X)[1] * shape(ft_Z)[1], ("There can be no more T features than the product "
|
||||
"of the number of X and Z features; otherwise "
|
||||
"there is not enough information to identify their "
|
||||
"structure")
|
||||
|
||||
# regress T expansion on X,Z expansions concatenated with W
|
||||
features = _add_ones(np.hstack([W, cross_product(ft_X, ft_Z)]))
|
||||
self._model_T.fit(features, ft_T)
|
||||
# predict ft_T from interacted ft_X, ft_Z
|
||||
ft_T_hat = self._model_T.predict(features)
|
||||
self._model_Y.fit(_add_ones(np.hstack([W, cross_product(ft_T_hat, ft_X)])), Y)
|
||||
|
||||
def effect(self, X=None, T0=0, T1=1):
|
||||
"""
|
||||
Calculate the heterogeneous treatment effect τ(·,·,·).
|
||||
|
||||
The effect is calculated between the two treatment points
|
||||
conditional on a vector of features on a set of m test samples {T0ᵢ, T1ᵢ, Xᵢ}.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
T0: (m × dₜ) matrix or vector of length m
|
||||
Base treatments for each sample
|
||||
T1: (m × dₜ) matrix or vector of length m
|
||||
Target treatments for each sample
|
||||
X: optional (m × dₓ) matrix
|
||||
Features for each sample
|
||||
|
||||
Returns
|
||||
-------
|
||||
τ: (m × d_y) matrix
|
||||
Heterogeneous treatment effects on each outcome for each sample
|
||||
Note that when Y is a vector rather than a 2-dimensional array, the corresponding
|
||||
singleton dimension will be collapsed (so this method will return a vector)
|
||||
|
||||
"""
|
||||
if ndim(T0) == 0:
|
||||
T0 = np.full((1 if X is None else shape(X)[0],) + self._d_t, T0)
|
||||
if ndim(T1) == 0:
|
||||
T1 = np.full((1 if X is None else shape(X)[0],) + self._d_t, T1)
|
||||
if ndim(T0) == 1:
|
||||
T0 = reshape(T0, (-1, 1))
|
||||
if ndim(T1) == 1:
|
||||
T1 = reshape(T1, (-1, 1))
|
||||
if X is None:
|
||||
X = np.empty((shape(T0)[0], 0))
|
||||
assert shape(T0) == shape(T1)
|
||||
assert shape(T0)[0] == shape(X)[0]
|
||||
|
||||
W = np.zeros((shape(T0)[0], self._d_w)) # can set arbitrarily since values will cancel
|
||||
ft_X = self._x_featurizer.transform(X)
|
||||
ft_T0 = self._t_featurizer.transform(T0)
|
||||
ft_T1 = self._t_featurizer.transform(T1)
|
||||
Y0 = self._model_Y.predict(_add_ones(np.hstack([W, cross_product(ft_T0, ft_X)])))
|
||||
Y1 = self._model_Y.predict(_add_ones(np.hstack([W, cross_product(ft_T1, ft_X)])))
|
||||
return Y1 - Y0
|
||||
|
||||
def marginal_effect(self, T, X=None):
|
||||
"""
|
||||
Calculate the heterogeneous marginal effect ∂τ(·, ·).
|
||||
|
||||
The marginal effect is calculated around a base treatment
|
||||
point conditional on a vector of features on a set of m test samples {Tᵢ, Xᵢ}.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
T: (m × dₜ) matrix
|
||||
Base treatments for each sample
|
||||
X: optional(m × dₓ) matrix
|
||||
Features for each sample
|
||||
|
||||
Returns
|
||||
-------
|
||||
grad_tau: (m × d_y × dₜ) array
|
||||
Heterogeneous marginal effects on each outcome for each sample
|
||||
Note that when Y or T is a vector rather than a 2-dimensional array,
|
||||
the corresponding singleton dimensions in the output will be collapsed
|
||||
(e.g. if both are vectors, then the output of this method will also be a vector)
|
||||
"""
|
||||
if X is None:
|
||||
X = np.empty((shape(T)[0], 0))
|
||||
assert shape(T)[0] == shape(X)[0]
|
||||
|
||||
ft_X = self._x_featurizer.transform(X)
|
||||
n = shape(T)[0]
|
||||
dT = self._dt_featurizer.transform(T if ndim(T) == 2 else reshape(T, (-1, 1)))
|
||||
W = np.zeros((size(T), self._d_w))
|
||||
# dT should be an n×dₜ×fₜ array (but if T was a vector, or if there is only one feature,
|
||||
# dT may be only 2-dimensional)
|
||||
# promote dT to 3D if necessary (e.g. if T was a vector)
|
||||
if ndim(dT) < 3:
|
||||
dT = reshape(dT, (n, 1, shape(dT)[1]))
|
||||
|
||||
# reshape ft_X and dT to allow cross product (result has shape n×dₜ×fₜ×f_x)
|
||||
features = reshape(ft_X, (n, 1, 1, -1)) * reshape(dT, shape(dT) + (1,))
|
||||
features = transpose(features, [0, 1, 3, 2]) # swap last two dims to match cross_product
|
||||
features = reshape(features, (size(T), -1))
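# the prepended intercept column is zero (and W was set to all zeros above) because neither
# depends on T, so only the T-derivative features contribute to the predicted marginal effect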
|
||||
output = self._model_Y.predict(_add_zeros(np.hstack([W, features])))
|
||||
output = reshape(output, shape(T) + shape(output)[1:])
|
||||
if ndim(output) == 3:
|
||||
return transpose(output, (0, 2, 1)) # transpose trailing T and Y dims
|
||||
else:
|
||||
return output
|
|
@ -0,0 +1,9 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from ._metalearners import (TLearner, SLearner, XLearner, DomainAdaptationLearner)
|
||||
|
||||
__all__ = ["TLearner",
|
||||
"SLearner",
|
||||
"XLearner",
|
||||
"DomainAdaptationLearner"]
|
|
@ -9,15 +9,15 @@ For more details on these CATE methods, see `<https://arxiv.org/abs/1706.03461>`
|
|||
|
||||
import numpy as np
|
||||
import warnings
|
||||
from ._cate_estimator import BaseCateEstimator, LinearCateEstimator, TreatmentExpansionMixin
|
||||
from .._cate_estimator import BaseCateEstimator, LinearCateEstimator, TreatmentExpansionMixin
|
||||
from sklearn import clone
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.utils import check_array, check_X_y
|
||||
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
|
||||
from .utilities import (check_inputs, check_models, broadcast_unit_treatments, reshape_treatmentwise_effects,
|
||||
inverse_onehot, transpose, _EncoderWrapper, _deprecate_positional)
|
||||
from ._shap import _shap_explain_model_cate
|
||||
from ..utilities import (check_inputs, check_models, broadcast_unit_treatments, reshape_treatmentwise_effects,
|
||||
inverse_onehot, transpose, _EncoderWrapper, _deprecate_positional)
|
||||
from .._shap import _shap_explain_model_cate
|
||||
|
||||
|
||||
class TLearner(TreatmentExpansionMixin, LinearCateEstimator):
|
|
@ -0,0 +1,18 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
""" An implementation of Orthogonal Random Forests [orf]_ and special
|
||||
case python classes.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [orf] M. Oprescu, V. Syrgkanis and Z. S. Wu.
|
||||
Orthogonal Random Forest for Causal Inference.
|
||||
*Proceedings of the 36th International Conference on Machine Learning*, 2019.
|
||||
URL http://proceedings.mlr.press/v97/oprescu19a.html.
|
||||
"""
|
||||
|
||||
from ._ortho_forest import DMLOrthoForest, DROrthoForest
|
||||
|
||||
__all__ = ["DMLOrthoForest",
|
||||
"DROrthoForest"]
|
(Several file diffs, including econml/ortho_iv.py (1812 lines), are not shown because of their large size.)
|
@ -3,8 +3,7 @@
|
|||
|
||||
"""Collection of scikit-learn extensions for linear models.
|
||||
|
||||
.. testcode::
|
||||
:hide:
|
||||
.. testsetup::
|
||||
|
||||
# Our classes that derive from sklearn ones sometimes include
|
||||
# inherited docstrings that have embedded doctests; we need the following imports
|
||||
|
@ -199,6 +198,7 @@ class WeightedLasso(WeightedModelMixin, Lasso):
|
|||
n_iter_ : int | array-like, shape (n_targets,)
|
||||
number of iterations run by the coordinate descent solver to reach
|
||||
the specified tolerance.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, alpha=1.0, fit_intercept=True,
|
||||
|
@ -294,6 +294,7 @@ class WeightedMultiTaskLasso(WeightedModelMixin, MultiTaskLasso):
|
|||
n_iter_ : int | array-like, shape (n_targets,)
|
||||
number of iterations run by the coordinate descent solver to reach
|
||||
the specified tolerance.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, alpha=1.0, fit_intercept=True, normalize=False,
|
||||
|
@ -326,6 +327,11 @@ class WeightedMultiTaskLasso(WeightedModelMixin, MultiTaskLasso):
|
|||
class WeightedLassoCV(WeightedModelMixin, LassoCV):
|
||||
"""Version of sklearn LassoCV that accepts weights.
|
||||
|
||||
.. testsetup::
|
||||
|
||||
import numpy as np
|
||||
from sklearn.linear_model import lasso_path
|
||||
|
||||
Parameters
|
||||
----------
|
||||
eps : float, optional
|
||||
|
@ -397,6 +403,7 @@ class WeightedLassoCV(WeightedModelMixin, LassoCV):
|
|||
rather than looping over features sequentially by default. This
|
||||
(setting to 'random') often leads to significantly faster convergence
|
||||
especially when tol is higher than 1e-4.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True,
|
||||
|
@ -438,6 +445,11 @@ class WeightedLassoCV(WeightedModelMixin, LassoCV):
|
|||
class WeightedMultiTaskLassoCV(WeightedModelMixin, MultiTaskLassoCV):
|
||||
"""Version of sklearn MultiTaskLassoCV that accepts weights.
|
||||
|
||||
.. testsetup::
|
||||
|
||||
import numpy as np
|
||||
from sklearn.linear_model import lasso_path
|
||||
|
||||
Parameters
|
||||
----------
|
||||
eps : float, optional
|
||||
|
@ -502,6 +514,7 @@ class WeightedMultiTaskLassoCV(WeightedModelMixin, MultiTaskLassoCV):
|
|||
rather than looping over features sequentially by default. This
|
||||
(setting to 'random') often leads to significantly faster convergence
|
||||
especially when tol is higher than 1e-4.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True,
|
||||
|
@ -575,6 +588,11 @@ class DebiasedLasso(WeightedLasso):
|
|||
|
||||
Only implemented for single-dimensional output.
|
||||
|
||||
.. testsetup::
|
||||
|
||||
import numpy as np
|
||||
from sklearn.linear_model import lasso_path
|
||||
|
||||
Parameters
|
||||
----------
|
||||
alpha : string | float, optional, default 'auto'.
|
||||
|
|
|
@ -11,7 +11,7 @@ from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, \
|
|||
from sklearn.model_selection import KFold
|
||||
from econml.dml import *
|
||||
from econml.metalearners import *
|
||||
from econml.drlearner import DRLearner
|
||||
from econml.dr import DRLearner
|
||||
import numpy as np
|
||||
from econml.utilities import shape, hstack, vstack, reshape, \
|
||||
cross_product
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from econml.bootstrap import BootstrapEstimator
|
||||
from econml.inference._bootstrap import BootstrapEstimator
|
||||
from econml.inference import BootstrapInference
|
||||
from econml.dml import LinearDML
|
||||
from econml.ortho_iv import LinearIntentToTreatDRIV
|
||||
from econml.two_stage_least_squares import NonparametricTwoStageLeastSquares
|
||||
from econml.iv.dr import LinearIntentToTreatDRIV
|
||||
from econml.iv.sieve import SieveTSLS
|
||||
from sklearn.linear_model import LinearRegression, LogisticRegression
|
||||
from sklearn.preprocessing import PolynomialFeatures
|
||||
import numpy as np
|
||||
|
@ -235,10 +235,10 @@ class TestBootstrap(unittest.TestCase):
|
|||
|
||||
opts = BootstrapInference(50, 2)
|
||||
|
||||
est = NonparametricTwoStageLeastSquares(t_featurizer=PolynomialFeatures(2),
|
||||
x_featurizer=PolynomialFeatures(2),
|
||||
z_featurizer=PolynomialFeatures(2),
|
||||
dt_featurizer=None)
|
||||
est = SieveTSLS(t_featurizer=PolynomialFeatures(2),
|
||||
x_featurizer=PolynomialFeatures(2),
|
||||
z_featurizer=PolynomialFeatures(2),
|
||||
dt_featurizer=None)
|
||||
est.fit(y, t, X=x, W=None, Z=z, inference=opts)
|
||||
|
||||
# test that we can get an interval for the same attribute for the bootstrap as the original,
|
||||
|
@ -274,7 +274,7 @@ class TestBootstrap(unittest.TestCase):
|
|||
Y = [1, 2, 3, 4, 5, 6]
|
||||
X = np.array([1, 1, 2, 2, 1, 2]).reshape(-1, 1)
|
||||
est = LinearDML(model_y=LinearRegression(), model_t=LogisticRegression(), discrete_treatment=True)
|
||||
inference = BootstrapInference(n_bootstrap_samples=5)
|
||||
inference = BootstrapInference(n_bootstrap_samples=5, n_jobs=-1, verbose=0)
|
||||
est.fit(Y, T, inference=inference)
|
||||
est.const_marginal_effect_interval()
|
||||
|
||||
|
@ -292,7 +292,7 @@ class TestBootstrap(unittest.TestCase):
|
|||
X = np.array([1, 1, 2, 2, 1, 2, 1, 2]).reshape(-1, 1)
|
||||
est = LinearIntentToTreatDRIV(model_Y_X=LinearRegression(), model_T_XZ=LogisticRegression(),
|
||||
flexible_model_effect=LinearRegression(), cv=2)
|
||||
inference = BootstrapInference(n_bootstrap_samples=20)
|
||||
inference = BootstrapInference(n_bootstrap_samples=20, n_jobs=-1, verbose=3)
|
||||
est.fit(Y, T, Z=Z, X=X, inference=inference)
|
||||
est.const_marginal_effect_interval(X)
|
||||
|
||||
|
@ -303,7 +303,7 @@ class TestBootstrap(unittest.TestCase):
|
|||
est = LinearDML(cv=2)
|
||||
for kind in ['percentile', 'pivot', 'normal']:
|
||||
with self.subTest(kind=kind):
|
||||
inference = BootstrapInference(n_bootstrap_samples=5, bootstrap_type=kind)
|
||||
inference = BootstrapInference(n_bootstrap_samples=5, n_jobs=-1, verbose=0, bootstrap_type=kind)
|
||||
est.fit(Y, T, inference=inference)
|
||||
i = est.const_marginal_effect_interval()
|
||||
inf = est.const_marginal_effect_inference()
|
||||
|
|
|
@ -13,8 +13,9 @@ import keras.backend as K
|
|||
|
||||
import pytest
|
||||
|
||||
from econml.deepiv import _zero_grad
|
||||
from econml.deepiv import mog_model, mog_loss_model, mog_sample_model, response_loss_model, DeepIV
|
||||
from econml.iv.nnet._deepiv import _zero_grad
|
||||
from econml.iv.nnet import DeepIV
|
||||
from econml.iv.nnet._deepiv import mog_model, mog_loss_model, mog_sample_model, response_loss_model
|
||||
from econml.utilities import reshape
|
||||
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@ from sklearn.pipeline import Pipeline
|
|||
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
|
||||
from sklearn.model_selection import KFold, GroupKFold
|
||||
from sklearn.preprocessing import PolynomialFeatures
|
||||
from econml.drlearner import DRLearner, LinearDRLearner, SparseLinearDRLearner, ForestDRLearner
|
||||
from econml.dr import DRLearner, LinearDRLearner, SparseLinearDRLearner, ForestDRLearner
|
||||
from econml.utilities import shape, hstack, vstack, reshape, cross_product
|
||||
from econml.inference import BootstrapInference, StatsModelsInferenceDiscrete
|
||||
from contextlib import ExitStack
|
||||
|
|
|
@ -8,7 +8,7 @@ from sklearn.base import clone
|
|||
from sklearn.preprocessing import PolynomialFeatures
|
||||
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso
|
||||
from econml.dml import LinearDML, DML, NonParamDML
|
||||
from econml.drlearner import LinearDRLearner, DRLearner
|
||||
from econml.dr import LinearDRLearner, DRLearner
|
||||
from econml.inference import (BootstrapInference, NormalInferenceResults,
|
||||
EmpiricalInferenceResults, PopulationSummaryResults)
|
||||
from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression, DebiasedLasso
|
||||
|
|
|
@ -6,9 +6,9 @@
|
|||
import unittest
|
||||
from sklearn.linear_model import LinearRegression, LogisticRegression
|
||||
from econml.dml import (DML, LinearDML, SparseLinearDML, KernelDML, NonParamDML, ForestDML)
|
||||
from econml.drlearner import (DRLearner, LinearDRLearner, SparseLinearDRLearner, ForestDRLearner)
|
||||
from econml.ortho_iv import (DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV,
|
||||
IntentToTreatDRIV, LinearIntentToTreatDRIV)
|
||||
from econml.dr import (DRLearner, LinearDRLearner, SparseLinearDRLearner, ForestDRLearner)
|
||||
from econml.iv.dml import (DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV)
|
||||
from econml.iv.dr import (IntentToTreatDRIV, LinearIntentToTreatDRIV)
|
||||
import numpy as np
|
||||
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@ from sklearn.exceptions import DataConversionWarning
|
|||
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, LogisticRegression, LogisticRegressionCV
|
||||
from sklearn.multioutput import MultiOutputRegressor
|
||||
from sklearn.pipeline import Pipeline
|
||||
from econml.ortho_forest import DMLOrthoForest, DROrthoForest
|
||||
from econml.orf import DMLOrthoForest, DROrthoForest
|
||||
from econml.sklearn_extensions.linear_model import WeightedLassoCVWrapper
|
||||
|
||||
|
||||
|
|
|
@ -8,8 +8,8 @@ from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression
|
|||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, PolynomialFeatures
|
||||
from sklearn.model_selection import KFold
|
||||
from econml.ortho_iv import (DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV,
|
||||
IntentToTreatDRIV, LinearIntentToTreatDRIV)
|
||||
from econml.iv.dml import (DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV)
|
||||
from econml.iv.dr import (IntentToTreatDRIV, LinearIntentToTreatDRIV)
|
||||
import numpy as np
|
||||
from econml.utilities import shape, hstack, vstack, reshape, cross_product
|
||||
from econml.inference import BootstrapInference
|
||||
|
|
|
@ -7,9 +7,9 @@ from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, Polynomial
|
|||
from sklearn.model_selection import KFold, GroupKFold
|
||||
from econml.dml import DML, LinearDML, SparseLinearDML, KernelDML
|
||||
from econml.dml import NonParamDML, CausalForestDML
|
||||
from econml.drlearner import DRLearner, SparseLinearDRLearner, LinearDRLearner, ForestDRLearner
|
||||
from econml.ortho_iv import DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV,\
|
||||
IntentToTreatDRIV, LinearIntentToTreatDRIV
|
||||
from econml.dr import DRLearner, SparseLinearDRLearner, LinearDRLearner, ForestDRLearner
|
||||
from econml.iv.dml import (DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV)
|
||||
from econml.iv.dr import (IntentToTreatDRIV, LinearIntentToTreatDRIV)
|
||||
import numpy as np
|
||||
from econml.utilities import shape, hstack, vstack, reshape, cross_product
|
||||
from econml.inference import BootstrapInference
|
||||
|
|
|
@ -6,9 +6,9 @@ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
|
|||
from sklearn.preprocessing import PolynomialFeatures
|
||||
from sklearn.pipeline import Pipeline
|
||||
from econml.dml import (DML, LinearDML, SparseLinearDML, KernelDML, NonParamDML, ForestDML)
|
||||
from econml.drlearner import (DRLearner, LinearDRLearner, SparseLinearDRLearner, ForestDRLearner)
|
||||
from econml.ortho_iv import (DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV,
|
||||
IntentToTreatDRIV, LinearIntentToTreatDRIV)
|
||||
from econml.dr import (DRLearner, LinearDRLearner, SparseLinearDRLearner, ForestDRLearner)
|
||||
from econml.iv.dml import (DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV)
|
||||
from econml.iv.dr import (IntentToTreatDRIV, LinearIntentToTreatDRIV)
|
||||
from econml.sklearn_extensions.linear_model import (DebiasedLasso, WeightedLasso,
|
||||
StatsModelsRLM, StatsModelsLinearRegression)
|
||||
from econml.inference import NormalInferenceResults, BootstrapInference
|
||||
|
|
|
@ -11,7 +11,7 @@ from joblib import Parallel, delayed
|
|||
|
||||
from econml.dml import DML, LinearDML, SparseLinearDML, NonParamDML
|
||||
from econml.metalearners import XLearner, TLearner, SLearner, DomainAdaptationLearner
|
||||
from econml.drlearner import DRLearner
|
||||
from econml.dr import DRLearner
|
||||
from econml.score import RScorer
|
||||
|
||||
|
||||
|
|
|
@ -6,10 +6,10 @@ import unittest
|
|||
import shap
|
||||
from shap.plots import scatter, heatmap, bar, beeswarm, waterfall, force
|
||||
from econml.dml import *
|
||||
from econml.ortho_forest import *
|
||||
from econml.drlearner import *
|
||||
from econml.orf import *
|
||||
from econml.dr import *
|
||||
from econml.metalearners import *
|
||||
from sklearn.linear_model import LinearRegression, LogisticRegression
|
||||
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso
|
||||
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
|
||||
from sklearn.preprocessing import PolynomialFeatures
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ import warnings
|
|||
import pytest
|
||||
|
||||
from econml.utilities import shape, reshape
|
||||
from econml.two_stage_least_squares import (NonparametricTwoStageLeastSquares, HermiteFeatures, DPolynomialFeatures)
|
||||
from econml.iv.sieve import (SieveTSLS, HermiteFeatures, DPolynomialFeatures)
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.preprocessing import PolynomialFeatures
|
||||
|
||||
|
@ -73,7 +73,7 @@ class Test2SLS(unittest.TestCase):
|
|||
d_w = 1
|
||||
if d_z >= n_t:
|
||||
T, Y, X, Z, W = [make_random(d) for d in [d_t, d_y, d_x, d_z, d_w]]
|
||||
est = NonparametricTwoStageLeastSquares(
|
||||
est = SieveTSLS(
|
||||
t_featurizer=PolynomialFeatures(),
|
||||
x_featurizer=PolynomialFeatures(),
|
||||
z_featurizer=PolynomialFeatures(),
|
||||
|
@ -100,7 +100,7 @@ class Test2SLS(unittest.TestCase):
|
|||
T = np.hstack([np.cross(X, Z).reshape(-1, 1) + W, (np.prod(X, axis=1) + np.prod(Z, axis=1)).reshape(-1, 1)])
|
||||
Y = X * T + X**2
|
||||
|
||||
est = NonparametricTwoStageLeastSquares(
|
||||
est = SieveTSLS(
|
||||
t_featurizer=PolynomialFeatures(degree=2, interaction_only=False, include_bias=True),
|
||||
x_featurizer=PolynomialFeatures(degree=2, interaction_only=False, include_bias=True),
|
||||
z_featurizer=PolynomialFeatures(degree=2, interaction_only=False, include_bias=True),
|
||||
|
@ -149,10 +149,10 @@ class Test2SLS(unittest.TestCase):
|
|||
p_fresh = x_fresh + z_fresh * e_fresh + np.random.uniform(size=(n, d_t))
|
||||
|
||||
for (dt, dx, dz) in [(0, 0, 0), (1, 1, 1), (5, 5, 5), (10, 10, 10), (3, 3, 10), (10, 10, 3)]:
|
||||
np2sls = NonparametricTwoStageLeastSquares(t_featurizer=HermiteFeatures(dt),
|
||||
x_featurizer=HermiteFeatures(dx),
|
||||
z_featurizer=HermiteFeatures(dz),
|
||||
dt_featurizer=HermiteFeatures(dt, shift=1))
|
||||
np2sls = SieveTSLS(t_featurizer=HermiteFeatures(dt),
|
||||
x_featurizer=HermiteFeatures(dx),
|
||||
z_featurizer=HermiteFeatures(dz),
|
||||
dt_featurizer=HermiteFeatures(dt, shift=1))
|
||||
np2sls.fit(y, p, X=x, W=w, Z=z)
|
||||
effect = np2sls.effect(x_fresh, np.zeros(shape(p_fresh)), p_fresh)
|
||||
losses.append(np.mean(np.square(p_fresh * x_fresh - effect)))
|
||||
|
|
|
@ -1,362 +1,26 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""Provides a non-parametric two-stage least squares instrumental variable estimator."""
|
||||
|
||||
import numpy as np
|
||||
from copy import deepcopy
|
||||
from sklearn import clone
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from .utilities import shape, transpose, reshape, cross_product, ndim, size,\
|
||||
_deprecate_positional, check_input_arrays
|
||||
from ._cate_estimator import BaseCateEstimator, LinearCateEstimator
|
||||
from numpy.polynomial.hermite_e import hermeval
|
||||
from sklearn.base import TransformerMixin
|
||||
from sklearn.preprocessing import PolynomialFeatures
|
||||
from itertools import product
|
||||
|
||||
|
||||
class HermiteFeatures(TransformerMixin):
|
||||
"""
|
||||
Featurizer that returns (unscaled) Hermite function evaluations.
|
||||
|
||||
The evaluated functions are of degrees 0..`degree`, differentiated `shift` times.
|
||||
|
||||
If the input has shape(n, x) and `joint` is False, the output will have shape(n, (`degree`+ 1)×x) if `shift` is 0.
|
||||
If the input has shape(n, x) and `joint` is True, the output will have shape(n, (`degree`+ 1) ^ x) if `shift` is 0.
|
||||
In either case, if `shift` is nonzero there will be `shift` additional dimensions of size x
|
||||
between the first and last.
|
||||
"""
|
||||
|
||||
def __init__(self, degree, shift=0, joint=False):
|
||||
self._degree = degree
|
||||
self._shift = shift
|
||||
self._joint = joint
|
||||
|
||||
def _column_feats(self, X, shift):
|
||||
"""
|
||||
Apply Hermite function evaluations of degrees 0..`degree` differentiated `shift` times.
|
||||
|
||||
When applied to the column `X` of shape(n,), the resulting array has shape(n, (degree + 1)).
|
||||
"""
|
||||
assert ndim(X) == 1
|
||||
# this will have dimension (d,) + shape(X)
|
||||
coeffs = np.identity(self._degree + shift + 1)[:, shift:]
|
||||
feats = ((-1) ** shift) * hermeval(X, coeffs) * np.exp(-X * X / 2)
|
||||
# send the first dimension to the end
|
||||
return transpose(feats)
|
||||
|
||||
def fit(self, X):
|
||||
"""Fits the data(a NOP for this class) and returns self."""
|
||||
return self
|
||||
|
||||
def transform(self, X):
|
||||
"""
|
||||
Transform the data by applying the appropriate Hermite functions.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X: array_like
|
||||
2-dimensional array of input features
|
||||
|
||||
Returns
|
||||
-------
|
||||
The transformed data
|
||||
"""
|
||||
assert ndim(X) == 2
|
||||
n = shape(X)[0]
|
||||
ncols = shape(X)[1]
|
||||
columns = []
|
||||
for indices in product(*[range(ncols) for i in range(self._shift)]):
|
||||
if self._joint:
|
||||
columns.append(cross_product(*[self._column_feats(X[:, i], indices.count(i))
|
||||
for i in range(shape(X)[1])]))
|
||||
else:
|
||||
indices = set(indices)
|
||||
if self._shift == 0: # return features for all columns:
|
||||
columns.append(np.hstack([self._column_feats(X[:, i], self._shift) for i in range(shape(X)[1])]))
|
||||
# columns are featurized independently; partial derivatives are only non-zero
|
||||
# when taken with respect to the same column each time
|
||||
elif len(indices) == 1:
|
||||
index = list(indices)[0]
|
||||
feats = self._column_feats(X[:, index], self._shift)
|
||||
columns.append(np.hstack([feats if i == index else np.zeros(shape(feats))
|
||||
for i in range(shape(X)[1])]))
|
||||
else:
|
||||
columns.append(np.zeros((n, (self._degree + 1) * ncols)))
|
||||
return reshape(np.hstack(columns), (n,) + (ncols,) * self._shift + (-1,))
|
||||
|
||||
|
||||
class DPolynomialFeatures(TransformerMixin):
|
||||
"""
|
||||
Featurizer that returns the derivatives of :class:`~sklearn.preprocessing.PolynomialFeatures` features in
|
||||
a way that's compatible with the expectations of :class:`.NonparametricTwoStageLeastSquares`'s
|
||||
`dt_featurizer` parameter.
|
||||
|
||||
If the input has shape `(n, x)` and
|
||||
:meth:`PolynomialFeatures.transform<sklearn.preprocessing.PolynomialFeatures.transform>` returns an output
|
||||
of shape `(n, f)`, then :meth:`.transform` will return an array of shape `(n, x, f)`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
degree: integer, default = 2
|
||||
The degree of the polynomial features.
|
||||
|
||||
interaction_only: boolean, default = False
|
||||
If true, only derivatives of interaction features are produced: features that are products of at most degree
|
||||
distinct input features (so not `x[1] ** 2`, `x[0] * x[2] ** 3`, etc.).
|
||||
|
||||
include_bias: boolean, default = True
|
||||
If True (default), then include the derivative of a bias column, the feature in which all polynomial powers
|
||||
are zero.
|
||||
"""
|
||||
|
||||
def __init__(self, degree=2, interaction_only=False, include_bias=True):
|
||||
self.F = PolynomialFeatures(degree=degree, interaction_only=interaction_only, include_bias=include_bias)
|
||||
|
||||
def fit(self, X, y=None):
|
||||
"""
|
||||
Compute number of output features.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape (n_samples, n_features)
|
||||
The data.
|
||||
y : array, optional
|
||||
Not used
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : instance
|
||||
"""
|
||||
return self
|
||||
|
||||
def transform(self, X):
|
||||
"""
|
||||
Transform data to derivatives of polynomial features
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X: array-like, shape (n_samples, n_features)
|
||||
The data to transform, row by row.
|
||||
|
||||
Returns
|
||||
-------
|
||||
XP: array-like, shape (n_samples, n_features, n_output_features)
|
||||
The matrix of features, where `n_output_features` is the number of features that
|
||||
would be returned from :class:`~sklearn.preprocessing.PolynomialFeatures`.
|
||||
"""
|
||||
self.F.fit(X)
|
||||
powers = self.F.powers_
|
||||
result = np.zeros(X.shape + (self.F.n_output_features_,))
|
||||
for i in range(X.shape[1]):
|
||||
p = powers.copy()
|
||||
c = powers[:, i]
|
||||
p[:, i] -= 1
|
||||
M = np.float_power(X[:, np.newaxis, :], p[np.newaxis, :, :])
|
||||
result[:, i, :] = c[np.newaxis, :] * np.prod(M, axis=-1)
|
||||
return result
|
||||
|
||||
|
||||
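# A small shape sketch for the featurizer above (illustrative only, not part of the original
# module): for the degree-2 polynomial features 1, x0, x1, x0^2, x0*x1, x1^2 of a 2-column
# input, transform returns the per-column partial derivatives of each of those 6 features.
def _dpolynomial_features_example():
    X = np.random.normal(size=(4, 2))
    dF = DPolynomialFeatures(degree=2).fit(X).transform(X)  # shape (4, 2, 6)
    # e.g. dF[:, 0, :] holds d/dx0 of [1, x0, x1, x0^2, x0*x1, x1^2] = [0, 1, 0, 2*x0, x1, 0]
    return dF.shape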
def _add_ones(arr):
    """Add a column of ones to the front of an array."""
    return np.hstack([np.ones((shape(arr)[0], 1)), arr])


def _add_zeros(arr):
    """Add a column of zeros to the front of an array."""
    return np.hstack([np.zeros((shape(arr)[0], 1)), arr])
class NonparametricTwoStageLeastSquares(BaseCateEstimator):
    """
    Non-parametric instrumental variables estimator.

    Supports the use of arbitrary featurizers for the features, treatments, and instruments.

    Parameters
    ----------
    t_featurizer: transformer
        Featurizer used to transform the treatments

    x_featurizer: transformer
        Featurizer used to transform the raw features

    z_featurizer: transformer
        Featurizer used to transform the instruments

    dt_featurizer: transformer
        Featurizer used to transform the treatments for the computation of the marginal effect.
        This should produce a 3-dimensional array, containing the per-treatment derivative of
        each transformed treatment. That is, given a treatment array of shape (n, dₜ),
        the output should have shape (n, dₜ, fₜ), where fₜ is the number of columns produced by `t_featurizer`.

    """

    def __init__(self, *,
                 t_featurizer,
                 x_featurizer,
                 z_featurizer,
                 dt_featurizer):
        self._t_featurizer = clone(t_featurizer, safe=False)
        self._x_featurizer = clone(x_featurizer, safe=False)
        self._z_featurizer = clone(z_featurizer, safe=False)
        self._dt_featurizer = clone(dt_featurizer, safe=False)
        # don't fit intercept; manually add column of ones to the data instead;
        # this allows us to ignore the intercept when computing marginal effects
        self._model_T = LinearRegression(fit_intercept=False)
        self._model_Y = LinearRegression(fit_intercept=False)
        super().__init__()

    @_deprecate_positional("X, W, and Z should be passed by keyword only. In a future release "
                           "we will disallow passing X, W, and Z by position.", ['X', 'W', 'Z'])
    @BaseCateEstimator._wrap_fit
    def fit(self, Y, T, X, W, Z, *, inference=None):
        """
        Estimate the counterfactual model from data, i.e. estimates functions τ(·, ·, ·), ∂τ(·, ·).

        Parameters
        ----------
        Y: (n × d_y) matrix
            Outcomes for each sample
        T: (n × dₜ) matrix
            Treatments for each sample
        X: optional (n × dₓ) matrix
            Features for each sample
        W: optional (n × d_w) matrix
            Controls for each sample
        Z: optional (n × d_z) matrix
            Instruments for each sample
        inference: string, :class:`.Inference` instance, or None
            Method for performing inference. This estimator supports 'bootstrap'
            (or an instance of :class:`.BootstrapInference`)

        Returns
        -------
        self

        """
        Y, T, X, W, Z = check_input_arrays(Y, T, X, W, Z)
        if X is None:
            X = np.empty((shape(Y)[0], 0))
        if W is None:
            W = np.empty((shape(Y)[0], 0))
        assert shape(Y)[0] == shape(T)[0] == shape(X)[0] == shape(W)[0] == shape(Z)[0]

        # make T 2D if it was a vector
        if ndim(T) == 1:
            T = reshape(T, (-1, 1))

        # store number of columns of W so that we can create correctly shaped zero array in effect and marginal effect
        self._d_w = shape(W)[1]

        # two stage approximation
        # first, get basis expansions of T, X, and Z
        ft_X = self._x_featurizer.fit_transform(X)
        ft_Z = self._z_featurizer.fit_transform(Z)
        ft_T = self._t_featurizer.fit_transform(T)
        # TODO: is it right that the effective number of instruments is the
        #       product of ft_X and ft_Z, not just ft_Z?
        assert shape(ft_T)[1] <= shape(ft_X)[1] * shape(ft_Z)[1], ("There can be no more T features than the product "
                                                                   "of the number of X and Z features; otherwise "
                                                                   "there is not enough information to identify their "
                                                                   "structure")

        # regress T expansion on X,Z expansions concatenated with W
        features = _add_ones(np.hstack([W, cross_product(ft_X, ft_Z)]))
        self._model_T.fit(features, ft_T)
        # predict ft_T from interacted ft_X, ft_Z
        ft_T_hat = self._model_T.predict(features)
        self._model_Y.fit(_add_ones(np.hstack([W, cross_product(ft_T_hat, ft_X)])), Y)

    def effect(self, X=None, T0=0, T1=1):
        """
        Calculate the heterogeneous treatment effect τ(·,·,·).

        The effect is calculated between the two treatment points
        conditional on a vector of features on a set of m test samples {T0ᵢ, T1ᵢ, Xᵢ}.

        Parameters
        ----------
        T0: (m × dₜ) matrix or vector of length m
            Base treatments for each sample
        T1: (m × dₜ) matrix or vector of length m
            Target treatments for each sample
        X: optional (m × dₓ) matrix
            Features for each sample

        Returns
        -------
        τ: (m × d_y) matrix
            Heterogeneous treatment effects on each outcome for each sample
            Note that when Y is a vector rather than a 2-dimensional array, the corresponding
            singleton dimension will be collapsed (so this method will return a vector)

        """
        if ndim(T0) == 0:
            T0 = np.full((1 if X is None else shape(X)[0],) + self._d_t, T0)
        if ndim(T1) == 0:
            T1 = np.full((1 if X is None else shape(X)[0],) + self._d_t, T1)
        if ndim(T0) == 1:
            T0 = reshape(T0, (-1, 1))
        if ndim(T1) == 1:
            T1 = reshape(T1, (-1, 1))
        if X is None:
            X = np.empty((shape(T0)[0], 0))
        assert shape(T0) == shape(T1)
        assert shape(T0)[0] == shape(X)[0]

        W = np.zeros((shape(T0)[0], self._d_w))  # can set arbitrarily since values will cancel
        ft_X = self._x_featurizer.transform(X)
        ft_T0 = self._t_featurizer.transform(T0)
        ft_T1 = self._t_featurizer.transform(T1)
        Y0 = self._model_Y.predict(_add_ones(np.hstack([W, cross_product(ft_T0, ft_X)])))
        Y1 = self._model_Y.predict(_add_ones(np.hstack([W, cross_product(ft_T1, ft_X)])))
        return Y1 - Y0

    def marginal_effect(self, T, X=None):
        """
        Calculate the heterogeneous marginal effect ∂τ(·, ·).

        The marginal effect is calculated around a base treatment
        point conditional on a vector of features on a set of m test samples {Tᵢ, Xᵢ}.

        Parameters
        ----------
        T: (m × dₜ) matrix
            Base treatments for each sample
        X: optional (m × dₓ) matrix
            Features for each sample

        Returns
        -------
        grad_tau: (m × d_y × dₜ) array
            Heterogeneous marginal effects on each outcome for each sample
            Note that when Y or T is a vector rather than a 2-dimensional array,
            the corresponding singleton dimensions in the output will be collapsed
            (e.g. if both are vectors, then the output of this method will also be a vector)
        """
        if X is None:
            X = np.empty((shape(T)[0], 0))
        assert shape(T)[0] == shape(X)[0]

        ft_X = self._x_featurizer.transform(X)
        n = shape(T)[0]
        dT = self._dt_featurizer.transform(T if ndim(T) == 2 else reshape(T, (-1, 1)))
        W = np.zeros((size(T), self._d_w))
        # dT should be an n×dₜ×fₜ array (but if T was a vector, or if there is only one feature,
        # dT may be only 2-dimensional)
        # promote dT to 3D if necessary (e.g. if T was a vector)
        if ndim(dT) < 3:
            dT = reshape(dT, (n, 1, shape(dT)[1]))

        # reshape ft_X and dT to allow cross product (result has shape n×dₜ×fₜ×f_x)
        features = reshape(ft_X, (n, 1, 1, -1)) * reshape(dT, shape(dT) + (1,))
        features = transpose(features, [0, 1, 3, 2])  # swap last two dims to match cross_product
        features = reshape(features, (size(T), -1))
        output = self._model_Y.predict(_add_zeros(np.hstack([W, features])))
        output = reshape(output, shape(T) + shape(output)[1:])
        if ndim(output) == 3:
            return transpose(output, (0, 2, 1))  # transpose trailing T and Y dims
        else:
            return output
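# End-to-end usage sketch for the estimator above (illustrative only, not part of the original
# module; the data-generating process and sample sizes are made up). Polynomial sieves are used
# for T, X and Z, with DPolynomialFeatures supplying the matching treatment derivatives that
# marginal_effect expects from dt_featurizer.
def _sieve_tsls_example():
    n = 1000
    Z = np.random.normal(size=(n, 1))
    W = np.random.normal(size=(n, 1))
    X = np.random.normal(size=(n, 1))
    T = Z[:, 0] + np.random.normal(scale=0.5, size=n)
    Y = T ** 2 + W[:, 0] + np.random.normal(scale=0.5, size=n)
    est = NonparametricTwoStageLeastSquares(
        t_featurizer=PolynomialFeatures(degree=2, include_bias=False),
        x_featurizer=PolynomialFeatures(degree=2, include_bias=False),
        z_featurizer=PolynomialFeatures(degree=2, include_bias=False),
        dt_featurizer=DPolynomialFeatures(degree=2, include_bias=False))
    est.fit(Y, T, X=X, W=W, Z=Z)
    # effect between T=0 and T=1, and the marginal effect around the observed treatments
    return est.effect(X[:10], T0=0, T1=1), est.marginal_effect(T[:10], X[:10])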
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import econml.iv.sieve as sieve
from .utilities import deprecated


@deprecated("The econml.two_stage_least_squares.HermiteFeatures class has been moved "
            "to econml.iv.sieve.HermiteFeatures; "
            "an upcoming release will remove support for the old name")
class HermiteFeatures(sieve.HermiteFeatures):
    pass


@deprecated("The econml.two_stage_least_squares.DPolynomialFeatures class has been moved "
            "to econml.iv.sieve.DPolynomialFeatures; "
            "an upcoming release will remove support for the old name")
class DPolynomialFeatures(sieve.DPolynomialFeatures):
    pass


@deprecated("The econml.two_stage_least_squares.NonparametricTwoStageLeastSquares class has been moved "
            "to econml.iv.sieve.SieveTSLS; "
            "an upcoming release will remove support for the old name")
class NonparametricTwoStageLeastSquares(sieve.SieveTSLS):
    pass
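# Illustrative note (not part of the original file): the classes aliased above now live under
# econml.iv.sieve, so new code would import them from there rather than from
# econml.two_stage_least_squares, e.g.:
def _new_import_path_example():
    from econml.iv.sieve import HermiteFeatures, DPolynomialFeatures, SieveTSLS
    return HermiteFeatures, DPolynomialFeatures, SieveTSLS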
@@ -5,7 +5,7 @@ import time
import argparse
import warnings
import joblib
from econml.sklearn_extensions.ensemble import SubsampledHonestForest
from econml.grf import RegressionForest


def monte_carlo():

@@ -19,7 +19,7 @@ def monte_carlo():
print(it)
X = np.random.normal(0, 1, size=(n, d))
y = X[:, 0] + np.random.normal(size=(n,))
est = SubsampledHonestForest(n_estimators=1000, verbose=1)
est = RegressionForest(n_estimators=1000, verbose=1)
est.fit(X, y)
point = est.predict(X_test)
low, up = est.predict_interval(X_test, alpha=0.05)

@@ -47,5 +47,5 @@ def monte_carlo():


if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Monte Carlo Coverage Tests for the SubsampledHonestForest')
parser = argparse.ArgumentParser(description='Monte Carlo Coverage Tests for the RegressionForest')
monte_carlo()
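# A self-contained sketch of the forest API exercised by the coverage script above (illustrative
# only; it mirrors the calls shown in the diff, while the sample sizes and the X_test grid are
# made up for the example).
import numpy as np
from econml.grf import RegressionForest

n, d = 1000, 5
X = np.random.normal(0, 1, size=(n, d))
y = X[:, 0] + np.random.normal(size=(n,))
X_test = np.random.normal(0, 1, size=(100, d))

est = RegressionForest(n_estimators=1000, verbose=1)
est.fit(X, y)
point = est.predict(X_test)                          # point predictions
low, up = est.predict_interval(X_test, alpha=0.05)   # 95% confidence intervals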
File differences are hidden because one or more lines are too long
@@ -61,7 +61,7 @@
"# Main imports\n",
"from econml.dml import DML, LinearDML, SparseLinearDML, NonParamDML\n",
"from econml.metalearners import XLearner, TLearner, SLearner, DomainAdaptationLearner\n",
"from econml.drlearner import DRLearner\n",
"from econml.dr import DRLearner\n",
"\n",
"import numpy as np\n",
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",

@@ -77,7 +77,7 @@
"from xgboost import XGBRegressor, XGBClassifier\n",
"\n",
"# EconML imports\n",
"from econml.drlearner import LinearDRLearner\n",
"from econml.dr import LinearDRLearner\n",
"\n",
"# DoWhy imports \n",
"import dowhy\n",

@@ -611,7 +611,7 @@
"source": [
"test_customers = X.iloc[:1000].values\n",
"drlearner_estimate = model.estimate_effect(identified_estimand,\n",
" method_name=\"backdoor.econml.drlearner.LinearDRLearner\",\n",
" method_name=\"backdoor.econml.dr.LinearDRLearner\",\n",
" target_units = test_customers,\n",
" treatment_value = 1,\n",
" method_params={\n",
@@ -63,7 +63,7 @@
"from xgboost import XGBRegressor, XGBClassifier\n",
"\n",
"# EconML imports\n",
"from econml.drlearner import LinearDRLearner\n",
"from econml.dr import LinearDRLearner\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",

@@ -436,7 +436,7 @@
{
"data": {
"text/plain": [
"<econml.drlearner.LinearDRLearner at 0x221d9532108>"
"<econml.dr.LinearDRLearner at 0x221d9532108>"
]
},
"execution_count": 8,

@@ -94,7 +94,7 @@
"from dowhy import CausalModel\n",
"\n",
"# EconML imports\n",
"from econml.ortho_iv import LinearIntentToTreatDRIV\n",
"from econml.iv.dr import LinearIntentToTreatDRIV\n",
"from econml.cate_interpreter import SingleTreeCateInterpreter, \\\n",
" SingleTreePolicyInterpreter\n",
"\n",

@@ -861,7 +861,7 @@
"source": [
"test_customers = X_data.iloc[:1000]\n",
"driv_estimate = model.estimate_effect(identified_estimand,\n",
" method_name=\"iv.econml.ortho_iv.LinearIntentToTreatDRIV\",\n",
" method_name=\"iv.econml.iv.dr.LinearIntentToTreatDRIV\",\n",
" target_units = test_customers,\n",
" method_params={\n",
" \"init_params\":{'model_T_XZ': model_T_XZ, \n",
@@ -82,7 +82,7 @@
"from sklearn.preprocessing import PolynomialFeatures\n",
"\n",
"# EconML imports\n",
"from econml.ortho_iv import LinearIntentToTreatDRIV\n",
"from econml.iv.dr import LinearIntentToTreatDRIV\n",
"from econml.cate_interpreter import SingleTreeCateInterpreter, \\\n",
" SingleTreePolicyInterpreter\n",
"\n",

@@ -419,7 +419,7 @@
{
"data": {
"text/plain": [
"<econml.ortho_iv.LinearIntentToTreatDRIV at 0x1b12c1e64c8>"
"<econml.iv.dr.LinearIntentToTreatDRIV at 0x1b12c1e64c8>"
]
},
"execution_count": 7,

@@ -43,7 +43,7 @@
"\n",
"### Using the SDK\n",
"\n",
"In the `econml` package, our Deep IV estimator is built on top of the Keras framework; we support either the Tensorflow or the Theano backends. There are three steps to using the `DeepIVEstimator`:\n",
"In the `econml` package, our Deep IV estimator is built on top of the Keras framework; we support either the Tensorflow or the Theano backends. There are three steps to using the `DeepIV`:\n",
"\n",
"1. Construct an instance. \n",
" * The `m` and `h` arguments to the initializer specify deep neural network models for estimating `T` and `Y` as described above. They are each *functions* that take two Keras inputs and return a Keras model (the inputs are `z` and `x` in the case of `m` and the output's shape should match `t`'s; the inputs are `t` and `x` in the case of `h` and the output's shape should match `y`'s). Note that the `h` function will be called multiple times, but should reuse the same weights - see below for a concrete example of how to achieve this using the Keras API.\n",

@@ -71,7 +71,7 @@
}
],
"source": [
"from econml.deepiv import DeepIVEstimator\n",
"from econml.iv.nnet import DeepIV\n",
"import keras\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",

@@ -226,7 +226,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we'll instantiate the `DeepIVEstimator` class using these models. Defining the response model *outside* of the lambda passed into constructor is important, because (depending on the settings for the loss) it can be used multiple times in the second stage and we want the same weights to be used every time."
"Now we'll instantiate the `DeepIV` class using these models. Defining the response model *outside* of the lambda passed into constructor is important, because (depending on the settings for the loss) it can be used multiple times in the second stage and we want the same weights to be used every time."
]
},
{

@@ -239,15 +239,15 @@
" \"validation_split\": 0.1,\n",
" \"callbacks\": [keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)]}\n",
"\n",
"deepIvEst = DeepIVEstimator(n_components = 10, # number of gaussians in our mixture density network\n",
" m = lambda z, x : treatment_model(keras.layers.concatenate([z,x])), # treatment model\n",
" h = lambda t, x : response_model(keras.layers.concatenate([t,x])), # response model\n",
" n_samples = 1, # number of samples to use to estimate the response\n",
" use_upper_bound_loss = False, # whether to use an approximation to the true loss\n",
" n_gradient_samples = 1, # number of samples to use in second estimate of the response (to make loss estimate unbiased)\n",
" optimizer='adam', # Keras optimizer to use for training - see https://keras.io/optimizers/ \n",
" first_stage_options = keras_fit_options, # options for training treatment model\n",
" second_stage_options = keras_fit_options) # options for training response model"
"deepIvEst = DeepIV(n_components = 10, # number of gaussians in our mixture density network\n",
" m = lambda z, x : treatment_model(keras.layers.concatenate([z,x])), # treatment model\n",
" h = lambda t, x : response_model(keras.layers.concatenate([t,x])), # response model\n",
" n_samples = 1, # number of samples to use to estimate the response\n",
" use_upper_bound_loss = False, # whether to use an approximation to the true loss\n",
" n_gradient_samples = 1, # number of samples to use in second estimate of the response (to make loss estimate unbiased)\n",
" optimizer='adam', # Keras optimizer to use for training - see https://keras.io/optimizers/ \n",
" first_stage_options = keras_fit_options, # options for training treatment model\n",
" second_stage_options = keras_fit_options) # options for training response model"
]
},
{
@@ -86,7 +86,7 @@
{
"data": {
"text/plain": [
"<econml.drlearner.LinearDRLearner at 0x2e3934ec710>"
"<econml.dr.LinearDRLearner at 0x2e3934ec710>"
]
},
"execution_count": 3,

@@ -96,7 +96,7 @@
],
"source": [
"from sklearn.linear_model import LassoCV\n",
"from econml.drlearner import LinearDRLearner\n",
"from econml.dr import LinearDRLearner\n",
"from sklearn.linear_model import LogisticRegressionCV\n",
"from sklearn.dummy import DummyClassifier\n",
"\n",

@@ -514,7 +514,7 @@
{
"data": {
"text/plain": [
"<econml.drlearner.LinearDRLearner at 0x2e39fe94240>"
"<econml.dr.LinearDRLearner at 0x2e39fe94240>"
]
},
"execution_count": 12,

@@ -524,7 +524,7 @@
],
"source": [
"from econml.sklearn_extensions.linear_model import WeightedLassoCV\n",
"from econml.drlearner import LinearDRLearner\n",
"from econml.dr import LinearDRLearner\n",
"from sklearn.linear_model import LogisticRegressionCV\n",
"from sklearn.dummy import DummyClassifier\n",
"from sklearn.preprocessing import PolynomialFeatures\n",

@@ -791,7 +791,7 @@
{
"data": {
"text/plain": [
"<econml.drlearner.SparseLinearDRLearner at 0x2e39fe94d30>"
"<econml.dr.SparseLinearDRLearner at 0x2e39fe94d30>"
]
},
"execution_count": 16,

@@ -801,7 +801,7 @@
],
"source": [
"from econml.sklearn_extensions.linear_model import WeightedLassoCV\n",
"from econml.drlearner import SparseLinearDRLearner\n",
"from econml.dr import SparseLinearDRLearner\n",
"from sklearn.linear_model import LogisticRegressionCV\n",
"from sklearn.dummy import DummyClassifier\n",
"from sklearn.preprocessing import PolynomialFeatures\n",

@@ -1143,7 +1143,7 @@
{
"data": {
"text/plain": [
"<econml.drlearner.ForestDRLearner at 0x2e3a0353400>"
"<econml.dr.ForestDRLearner at 0x2e3a0353400>"
]
},
"execution_count": 21,

@@ -1152,7 +1152,7 @@
}
],
"source": [
"from econml.drlearner import ForestDRLearner\n",
"from econml.dr import ForestDRLearner\n",
"from sklearn.ensemble import GradientBoostingRegressor\n",
"\n",
"est = ForestDRLearner(model_regression=GradientBoostingRegressor(),\n",

@@ -1690,7 +1690,7 @@
{
"data": {
"text/plain": [
"<econml.drlearner.DRLearner at 0x2e3a29294a8>"
"<econml.dr.DRLearner at 0x2e3a29294a8>"
]
},
"execution_count": 38,

@@ -1700,7 +1700,7 @@
],
"source": [
"# We need to use a scikit-learn final model\n",
"from econml.drlearner import DRLearner\n",
"from econml.dr import DRLearner\n",
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier\n",
"\n",
"# One can replace model_y and model_t with any scikit-learn regressor and classifier correspondingly\n",
@@ -490,7 +490,7 @@
{
"data": {
"text/plain": [
"<econml.drlearner.ForestDRLearner at 0x2496cd1b860>"
"<econml.dr.ForestDRLearner at 0x2496cd1b860>"
]
},
"execution_count": 19,

@@ -499,7 +499,7 @@
}
],
"source": [
"from econml.drlearner import ForestDRLearner\n",
"from econml.dr import ForestDRLearner\n",
"from sklearn.dummy import DummyRegressor, DummyClassifier\n",
"\n",
"est = ForestDRLearner(model_regression=model_y,\n",

@@ -520,7 +520,7 @@
{
"data": {
"text/plain": [
"<econml.drlearner.DRLearner at 0x2496cd9c2e8>"
"<econml.dr.DRLearner at 0x2496cd9c2e8>"
]
},
"execution_count": 20,

@@ -529,7 +529,7 @@
}
],
"source": [
"from econml.drlearner import DRLearner\n",
"from econml.dr import DRLearner\n",
"est2 = DRLearner(model_regression=model_y,\n",
" model_propensity=model_t,\n",
" model_final=final_stage(),\n",

@@ -780,7 +780,7 @@
{
"data": {
"text/plain": [
"<econml.ortho_forest.DROrthoForest at 0x24970f68780>"
"<econml.orf.DROrthoForest at 0x24970f68780>"
]
},
"execution_count": 30,

@@ -789,7 +789,7 @@
}
],
"source": [
"from econml.ortho_forest import DROrthoForest\n",
"from econml.orf import DROrthoForest\n",
"from sklearn.linear_model import Lasso, LassoCV, LogisticRegression, LogisticRegressionCV\n",
"from econml.sklearn_extensions.linear_model import WeightedLassoCV\n",
"\n",

@@ -23,7 +23,7 @@
"\n",
"Causal Forests and [Generalized Random Forests](https://arxiv.org/pdf/1610.01271.pdf) are a flexible method for estimating treatment effect heterogeneity with Random Forests. The `econml.grf` module implements a high-performance Cython version of the [`grf`](https://github.com/grf-labs/grf) R-package, with support for CausalForests, IVForests and RegressionForests. The module provides estimators that adhere to the scikit-learn fit and predict API, as well as providing methods for uncertainty quantification and confidence intervals.\n",
"\n",
"Within the EconML SDK we use these estimators as final models for CATE estimation, such as in the case of `econml.dml.CausalForestDML`, where we combine a Causal Forest with Double Machine Learning, to residualize the treatment and outcome and call the `econml.grf.CausalForest` on the residuals. Similarly, the `econml.drlearner.ForestDRLearner` uses an `econml.grf.RegressionForest` as a final stage estimator on the doubly robust targets estimated by the first stage. The estimators here should primarily be used in conjunction with CateEstimators and not as standalone, but we provide here examples of their direct usage functionality.\n",
"Within the EconML SDK we use these estimators as final models for CATE estimation, such as in the case of `econml.dml.CausalForestDML`, where we combine a Causal Forest with Double Machine Learning, to residualize the treatment and outcome and call the `econml.grf.CausalForest` on the residuals. Similarly, the `econml.dr.ForestDRLearner` uses an `econml.grf.RegressionForest` as a final stage estimator on the doubly robust targets estimated by the first stage. The estimators here should primarily be used in conjunction with CateEstimators and not as standalone, but we provide here examples of their direct usage functionality.\n",
"\n",
"The EconML SDK implements the following Generalized Random Forest variants:\n",
"\n",

@@ -41,9 +41,9 @@
"outputs": [],
"source": [
"from econml.dml import CausalForestDML, LinearDML, NonParamDML\n",
"from econml.drlearner import DRLearner\n",
"from econml.dr import DRLearner\n",
"from econml.metalearners import DomainAdaptationLearner, XLearner\n",
"from econml.ortho_iv import LinearIntentToTreatDRIV\n",
"from econml.iv.dr import LinearIntentToTreatDRIV\n",
"import numpy as np\n",
"import scipy.special\n",
"import matplotlib.pyplot as plt\n",

@@ -258,7 +258,7 @@
],
"source": [
"# Instantiate Doubly Robust Learner\n",
"from econml.drlearner import DRLearner\n",
"from econml.dr import DRLearner\n",
"outcome_model = GradientBoostingRegressor(n_estimators=100, max_depth=6, min_samples_leaf=int(n/100))\n",
"pseudo_treatment_model = GradientBoostingRegressor(n_estimators=100, max_depth=6, min_samples_leaf=int(n/100))\n",
"propensity_model = RandomForestClassifier(n_estimators=100, max_depth=6, \n",