diff --git a/README.md b/README.md index bb2484ee..2a398ba1 100644 --- a/README.md +++ b/README.md @@ -162,7 +162,7 @@ To install from source, see [For Developers](#for-developers) section below. Orthogonal Random Forests (click to expand) ```Python - from econml.ortho_forest import DMLOrthoForest, DROrthoForest + from econml.orf import DMLOrthoForest, DROrthoForest from econml.sklearn_extensions.linear_model import WeightedLasso, WeightedLassoCV # Use defaults est = DMLOrthoForest() @@ -233,7 +233,7 @@ To install from source, see [For Developers](#for-developers) section below. * Linear final stage ```Python -from econml.drlearner import LinearDRLearner +from econml.dr import LinearDRLearner from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier est = LinearDRLearner(model_propensity=GradientBoostingClassifier(), @@ -246,7 +246,7 @@ lb, ub = est.effect_interval(X_test, alpha=0.05) * Sparse linear final stage ```Python -from econml.drlearner import SparseLinearDRLearner +from econml.dr import SparseLinearDRLearner from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier est = SparseLinearDRLearner(model_propensity=GradientBoostingClassifier(), @@ -259,7 +259,7 @@ lb, ub = est.effect_interval(X_test, alpha=0.05) * Nonparametric final stage ```Python -from econml.drlearner import ForestDRLearner +from econml.dr import ForestDRLearner from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier est = ForestDRLearner(model_propensity=GradientBoostingClassifier(), @@ -276,7 +276,7 @@ lb, ub = est.effect_interval(X_test, alpha=0.05) * Intent to Treat Doubly Robust Learner (discrete instrument, discrete treatment) ```Python -from econml.ortho_iv import LinearIntentToTreatDRIV +from econml.iv.dr import LinearIntentToTreatDRIV from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier from sklearn.linear_model import LinearRegression @@ -295,7 +295,7 @@ lb, ub = est.effect_interval(X_test, alpha=0.05) # OLS confidence intervals ```Python import keras -from econml.deepiv import DeepIVEstimator +from econml.iv.nnet import DeepIV treatment_model = keras.Sequential([keras.layers.Dense(128, activation='relu', input_shape=(2,)), keras.layers.Dropout(0.17), @@ -310,11 +310,11 @@ response_model = keras.Sequential([keras.layers.Dense(128, activation='relu', in keras.layers.Dense(32, activation='relu'), keras.layers.Dropout(0.17), keras.layers.Dense(1)]) -est = DeepIVEstimator(n_components=10, # Number of gaussians in the mixture density networks) - m=lambda z, x: treatment_model(keras.layers.concatenate([z, x])), # Treatment model - h=lambda t, x: response_model(keras.layers.concatenate([t, x])), # Response model - n_samples=1 # Number of samples used to estimate the response - ) +est = DeepIV(n_components=10, # Number of gaussians in the mixture density networks) + m=lambda z, x: treatment_model(keras.layers.concatenate([z, x])), # Treatment model + h=lambda t, x: response_model(keras.layers.concatenate([t, x])), # Response model + n_samples=1 # Number of samples used to estimate the response + ) est.fit(Y, T, X=X, Z=Z) # Z -> instrumental variables treatment_effects = est.effect(X_test) ``` diff --git a/doc/conf.py b/doc/conf.py index 82840b6b..70adcc42 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -212,7 +212,8 @@ epub_exclude_files = ['search.html'] intersphinx_mapping = {'python': ('https://docs.python.org/3', None), 'numpy': ('https://docs.scipy.org/doc/numpy/', None), 'sklearn': 
('https://scikit-learn.org/stable/', None), - 'matplotlib': ('https://matplotlib.org/', None)} + 'matplotlib': ('https://matplotlib.org/', None), + 'shap': ('https://shap.readthedocs.io/en/stable/', None)} # -- Options for todo extension ---------------------------------------------- diff --git a/doc/map.svg b/doc/map.svg index 07906aad..937ab27e 100644 --- a/doc/map.svg +++ b/doc/map.svg @@ -94,24 +94,24 @@ Exploit Explicit Randomization - + 2SLS IV - + IntentToTreatDRIV - + LinearIntentToTreatDRIV - + Deep IV - + NonParamDMLIV @@ -171,54 +171,54 @@ Experimentation - + SparseLinearDML - + SparseLinearDRLearner - + LinearDML - + LinearDRLearner - + DMLOrthoForest - + DROrthoForest - + ForestDRLearner - + CausalForestDML - + MetaLearners - + DRLearner - + DML - + NonParamDML diff --git a/doc/reference.rst b/doc/reference.rst index d77de3d1..b82f1631 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -1,21 +1,232 @@ Public Module Reference ======================= +CATE Estimators +--------------- + +.. _dml_api: + +Double Machine Learning (DML) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + .. autosummary:: :toctree: _autosummary - econml.bootstrap - econml.cate_interpreter - econml.deepiv - econml.dml - econml.drlearner - econml.grf - econml.inference - econml.metalearners - econml.ortho_forest - econml.ortho_iv - econml.score - econml.two_stage_least_squares + econml.dml.DML + econml.dml.LinearDML + econml.dml.SparseLinearDML + econml.dml.CausalForestDML + econml.dml.NonParamDML + +.. _dr_api: + +Doubly Robust (DR) +^^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: _autosummary + + econml.dr.DRLearner + econml.dr.LinearDRLearner + econml.dr.SparseLinearDRLearner + econml.dr.ForestDRLearner + +.. _metalearners_api: + +Meta-Learners +^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: _autosummary + + econml.metalearners.XLearner + econml.metalearners.TLearner + econml.metalearners.SLearner + econml.metalearners.DomainAdaptationLearner + +.. _orf_api: + +Orthogonal Random Forest (ORF) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: _autosummary + + econml.orf.DMLOrthoForest + econml.orf.DROrthoForest + +Instrumental Variable CATE Estimators +------------------------------------- + +.. _dmliv_api: + +Double Machine Learning (DML) IV +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: _autosummary + + econml.iv.dml.DMLATEIV + econml.iv.dml.ProjectedDMLATEIV + econml.iv.dml.DMLIV + econml.iv.dml.NonParamDMLIV + +.. _driv_api: + +Doubly Robust (DR) IV +^^^^^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: _autosummary + + econml.iv.dr.IntentToTreatDRIV + econml.iv.dr.LinearIntentToTreatDRIV + +.. _deepiv_api: + +DeepIV +^^^^^^ + +.. autosummary:: + :toctree: _autosummary + + econml.iv.nnet.DeepIV + +.. _tsls_api: + +Sieve Methods +^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: _autosummary + + econml.iv.sieve.SieveTSLS + econml.iv.sieve.HermiteFeatures + econml.iv.sieve.DPolynomialFeatures + + +.. _interpreters_api: + +CATE Interpreters +----------------- + +.. autosummary:: + :toctree: _autosummary + + econml.cate_interpreter.SingleTreeCateInterpreter + econml.cate_interpreter.SingleTreePolicyInterpreter + +.. _scorers_api: + +CATE Scorers +------------ + +.. autosummary:: + :toctree: _autosummary + + econml.score.RScorer + econml.score.EnsembleCateEstimator + + +.. _grf_api: + +Generalized Random Forests +-------------------------- + +.. 
autosummary:: + :toctree: _autosummary + + econml.grf.CausalForest + econml.grf.CausalIVForest + econml.grf.RegressionForest + econml.grf.MultiOutputGRF + econml.grf.LinearMomentGRFCriterion + econml.grf.LinearMomentGRFCriterionMSE + econml.grf._base_grf.BaseGRF + econml.grf._base_grftree.GRFTree + + +.. Integration with AzureML AutoML +.. ------------------------------- + +.. .. autosummary:: +.. :toctree: _autosummary + +.. econml.automated_ml + +Scikit-Learn Extensions +----------------------- + +.. _sklearn_linear_api: + +Linear Model Extensions +^^^^^^^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: _autosummary + + econml.sklearn_extensions.linear_model.DebiasedLasso + econml.sklearn_extensions.linear_model.MultiOutputDebiasedLasso + econml.sklearn_extensions.linear_model.SelectiveRegularization + econml.sklearn_extensions.linear_model.StatsModelsLinearRegression + econml.sklearn_extensions.linear_model.StatsModelsRLM + econml.sklearn_extensions.linear_model.WeightedLasso + econml.sklearn_extensions.linear_model.WeightedLassoCV + econml.sklearn_extensions.linear_model.WeightedMultiTaskLassoCV + econml.sklearn_extensions.linear_model.WeightedLassoCVWrapper + +.. _sklearn_model_api: + +Model Selection Extensions +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: _autosummary + + econml.sklearn_extensions.model_selection.GridSearchCVList + econml.sklearn_extensions.model_selection.WeightedKFold + econml.sklearn_extensions.model_selection.WeightedStratifiedKFold + + +.. _inference_api: + +Inference +--------- + +Inference Results +^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: _autosummary + + econml.inference.NormalInferenceResults + econml.inference.EmpiricalInferenceResults + econml.inference.PopulationSummaryResults + +Inference Methods +^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: _autosummary + + econml.inference.BootstrapInference + econml.inference.GenericModelFinalInference + econml.inference.GenericSingleTreatmentModelFinalInference + econml.inference.LinearModelFinalInference + econml.inference.StatsModelsInference + econml.inference.GenericModelFinalInferenceDiscrete + econml.inference.LinearModelFinalInferenceDiscrete + econml.inference.StatsModelsInferenceDiscrete + + +.. _utilities_api: + +Utilities +--------- + +.. autosummary:: + :toctree: _autosummary + econml.utilities Private Module Reference @@ -26,18 +237,5 @@ Private Module Reference econml._ortho_learner econml._cate_estimator - econml._causal_tree - econml._shap econml.dml._rlearner - econml.grf._base_grf - econml.grf._base_grftree - econml.grf._criterion - -Scikit-Learn Extensions -======================= - -.. 
autosummary:: - :toctree: _autosummary - - econml.sklearn_extensions.linear_model - econml.sklearn_extensions.model_selection + econml.inference._bootstrap diff --git a/doc/spec/comparison.rst b/doc/spec/comparison.rst index c9a2c805..24906d20 100644 --- a/doc/spec/comparison.rst +++ b/doc/spec/comparison.rst @@ -9,7 +9,7 @@ Detailed estimator comparison +=============================================+==============+==============+==================+=============+=================+============+==============+====================+ | :class:`.NonparametricTwoStageLeastSquares` | Any | Yes | | Yes | Assumed | Yes | Yes | | +---------------------------------------------+--------------+--------------+------------------+-------------+-----------------+------------+--------------+--------------------+ -| :class:`.DeepIVEstimator` | Any | Yes | | | | Yes | Yes | | +| :class:`.DeepIV` | Any | Yes | | | | Yes | Yes | | +---------------------------------------------+--------------+--------------+------------------+-------------+-----------------+------------+--------------+--------------------+ | :class:`.SparseLinearDML` | Any | | Yes | Yes | Assumed | Yes | Yes | Yes | +---------------------------------------------+--------------+--------------+------------------+-------------+-----------------+------------+--------------+--------------------+ @@ -27,7 +27,7 @@ Detailed estimator comparison +---------------------------------------------+--------------+--------------+------------------+-------------+-----------------+------------+--------------+--------------------+ | :class:`.DROrthoForest` | Categorical | | Yes | | | | Yes | Yes | +---------------------------------------------+--------------+--------------+------------------+-------------+-----------------+------------+--------------+--------------------+ -| :mod:`~econml.metalearners` | Categorical | | | | | Yes | Yes | Yes | +| :ref:`metalearners ` | Categorical | | | | | Yes | Yes | Yes | +---------------------------------------------+--------------+--------------+------------------+-------------+-----------------+------------+--------------+--------------------+ | :class:`.DRLearner` | Categorical | | | | | | Yes | Yes | +---------------------------------------------+--------------+--------------+------------------+-------------+-----------------+------------+--------------+--------------------+ diff --git a/doc/spec/estimation/dml.rst b/doc/spec/estimation/dml.rst index edc1a319..4b4d8efd 100644 --- a/doc/spec/estimation/dml.rst +++ b/doc/spec/estimation/dml.rst @@ -445,7 +445,8 @@ Usage FAQs Alternatively, you can pick the best first stage models outside of the EconML framework and pass in the selected models to EconML. This can save on runtime and computational resources. Furthermore, it is statistically more stable since all data is being used for - training rather than a fold. E.g.: + hyper-parameter tuning rather than a single fold inside of the DML algorithm (as long as the number of hyperparameter values + that you are selecting over is not exponential in the number of samples, this approach is statistically valid). E.g.: .. 
testcode:: @@ -723,7 +724,6 @@ lightning package implements such a class:: from econml.dml import DML from sklearn.preprocessing import PolynomialFeatures from lightning.regression import FistaRegressor - from econml.bootstrap import BootstrapEstimator from sklearn.linear_model import MultiTaskElasticNet est = DML(model_y=MultiTaskElasticNet(alpha=0.1), diff --git a/doc/spec/estimation/dr.rst b/doc/spec/estimation/dr.rst index 76d6c4ae..9910a93a 100644 --- a/doc/spec/estimation/dr.rst +++ b/doc/spec/estimation/dr.rst @@ -68,7 +68,7 @@ characteristics :math:`X` of the treated samples, then one can use this method. .. testcode:: - from econml.drlearner import LinearDRLearner + from econml.dr import LinearDRLearner est = LinearDRLearner() est.fit(y, T, X=X, W=W) est.effect(X, T0=t0, T1=t1) @@ -195,7 +195,7 @@ is chosen for the final stage. The user can choose any regression/classification in all these variants. The hierarchy structure of the implemented CATE estimators is as follows. - .. inheritance-diagram:: econml.drlearner.DRLearner econml.drlearner.LinearDRLearner econml.drlearner.SparseLinearDRLearner econml.drlearner.ForestDRLearner + .. inheritance-diagram:: econml.dr.DRLearner econml.dr.LinearDRLearner econml.dr.SparseLinearDRLearner econml.dr.ForestDRLearner :parts: 1 :private-bases: :top-classes: econml._ortho_learner._OrthoLearner, econml._cate_estimator.StatsModelsCateEstimatorDiscreteMixin, econml._cate_estimator.DebiasedLassoCateEstimatorDiscreteMixin @@ -209,7 +209,7 @@ Below we give a brief description of each of these classes: .. testcode:: - from econml.drlearner import DRLearner + from econml.dr import DRLearner from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier est = DRLearner(model_regression=GradientBoostingRegressor(), model_propensity=GradientBoostingClassifier(), @@ -224,7 +224,7 @@ Below we give a brief description of each of these classes: .. testcode:: - from econml.drlearner import DRLearner + from econml.dr import DRLearner from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from sklearn.model_selection import GridSearchCV model_reg = lambda: GridSearchCV( @@ -260,7 +260,7 @@ Below we give a brief description of each of these classes: .. testcode:: - from econml.drlearner import LinearDRLearner + from econml.dr import LinearDRLearner est = LinearDRLearner() est.fit(y, T, X=X, W=W) point = est.effect(X, T1=t1) @@ -281,7 +281,7 @@ Below we give a brief description of each of these classes: .. testcode:: - from econml.drlearner import SparseLinearDRLearner + from econml.dr import SparseLinearDRLearner est = SparseLinearDRLearner() est.fit(y, T, X=X, W=W) point = est.effect(X, T1=T1) @@ -292,13 +292,13 @@ Below we give a brief description of each of these classes: - **ForestDRLearner.** The child class :class:`.ForestDRLearner` uses a Subsampled Honest Forest regressor as a final model (see [Wager2018]_ and [Athey2019]_). The subsampled honest forest is implemented in our library as a scikit-learn extension - of the :class:`~sklearn.ensemble.RandomForestRegressor`, in the class :class:`.SubsampledHonestForest`. This estimator + of the :class:`~sklearn.ensemble.RandomForestRegressor`, in the class :class:`~econml.grf.RegressionForest`. This estimator offers confidence intervals via the Bootstrap-of-Little-Bags as described in [Athey2019]_. Using this functionality we can also construct confidence intervals for the CATE: .. 
testcode:: - from econml.drlearner import ForestDRLearner + from econml.dr import ForestDRLearner from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier est = ForestDRLearner(model_regression=GradientBoostingRegressor(), model_propensity=GradientBoostingClassifier()) @@ -323,7 +323,7 @@ Usage FAQs .. testcode:: - from econml.drlearner import LinearDRLearner + from econml.dr import LinearDRLearner est = LinearDRLearner() est.fit(y, T, X=X, W=W) lb, ub = est.const_marginal_effect_interval(X, alpha=.05) @@ -341,7 +341,7 @@ Usage FAQs .. testcode:: - from econml.drlearner import SparseLinearDRLearner + from econml.dr import SparseLinearDRLearner from sklearn.preprocessing import PolynomialFeatures est = SparseLinearDRLearner(featurizer=PolynomialFeatures(degree=3, include_bias=False)) est.fit(y, T, X=X, W=W) @@ -355,7 +355,7 @@ Usage FAQs .. testcode:: - from econml.drlearner import ForestDRLearner + from econml.dr import ForestDRLearner from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier est = ForestDRLearner(model_regression=GradientBoostingRegressor(), model_propensity=GradientBoostingClassifier()) @@ -383,7 +383,7 @@ Usage FAQs .. testcode:: - from econml.drlearner import SparseLinearDRLearner + from econml.dr import SparseLinearDRLearner from sklearn.linear_model import LassoCV, LogisticRegressionCV, ElasticNetCV from sklearn.ensemble import GradientBoostingRegressor est = SparseLinearDRLearner(model_regression=LassoCV(), @@ -409,7 +409,7 @@ Usage FAQs .. testcode:: - from econml.drlearner import DRLearner + from econml.dr import DRLearner from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from sklearn.model_selection import GridSearchCV model_reg = lambda: GridSearchCV( @@ -433,7 +433,8 @@ Usage FAQs Alternatively, you can pick the best first stage models outside of the EconML framework and pass in the selected models to EconML. This can save on runtime and computational resources. Furthermore, it is statistically more stable since all data is being used for - training rather than a fold. E.g.: + hyper-parameter tuning rather than a single fold inside of the DML algorithm (as long as the number of hyperparameter values + that you are selecting over is not exponential in the number of samples, this approach is statistically valid). E.g.: .. testcode:: @@ -475,7 +476,7 @@ Usage FAQs .. testcode:: - from econml.drlearner import DRLearner + from econml.dr import DRLearner from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier est = DRLearner(model_regression=RandomForestRegressor(oob_score=True), model_propensity=RandomForestClassifier(min_samples_leaf=10, oob_score=True), diff --git a/doc/spec/estimation/forest.rst b/doc/spec/estimation/forest.rst index cbd8667b..48a4876d 100644 --- a/doc/spec/estimation/forest.rst +++ b/doc/spec/estimation/forest.rst @@ -165,7 +165,7 @@ some extensions to the scikit-learn library that enable sample weights, such as .. testcode:: intro :hide: - from econml.ortho_forest import DMLOrthoForest + from econml.orf import DMLOrthoForest from econml.sklearn_extensions.linear_model import WeightedLasso .. doctest:: intro @@ -303,10 +303,10 @@ sample :math:`X_i`. This is implemented in the RegressionForest (see :class:`.Re Class Hierarchy Structure ========================= -.. inheritance-diagram:: econml.ortho_forest.DMLOrthoForest econml.ortho_forest.DROrthoForest econml.drlearner.ForestDRLearner econml.dml.CausalForestDML +.. 
inheritance-diagram:: econml.orf.DMLOrthoForest econml.orf.DROrthoForest econml.dr.ForestDRLearner econml.dml.CausalForestDML :parts: 1 :private-bases: - :top-classes: econml._ortho_learner._OrthoLearner, econml.ortho_forest.BaseOrthoForest, econml._cate_estimator.LinearCateEstimator + :top-classes: econml._ortho_learner._OrthoLearner, econml.orf.BaseOrthoForest, econml._cate_estimator.LinearCateEstimator Usage Examples @@ -323,7 +323,7 @@ and the `ForestLearners Jupyter notebook >> T = np.array([0, 1]*60) @@ -333,7 +333,7 @@ and the `ForestLearners Jupyter notebook >> est.fit(Y, T, X=W, W=W) - + >>> print(est.effect(W[:2])) [1.00... 1.19...] @@ -346,7 +346,7 @@ Similarly, we can call :class:`.DROrthoForest`: ... propensity_model=sklearn.linear_model.LogisticRegression(), ... model_Y=sklearn.linear_model.LinearRegression()) >>> est.fit(Y, T, X=W, W=W) - + >>> print(est.effect(W[:2])) [0.99... 1.35...] @@ -355,8 +355,8 @@ and with more realistic noisy data. In this case we can just use the default par of the class, which specify the use of the :class:`~sklearn.linear_model.LassoCV` for both the treatment and the outcome regressions, in the case of continuous treatments. - >>> from econml.ortho_forest import DMLOrthoForest - >>> from econml.ortho_forest import DMLOrthoForest + >>> from econml.orf import DMLOrthoForest + >>> from econml.orf import DMLOrthoForest >>> from econml.sklearn_extensions.linear_model import WeightedLasso >>> import matplotlib.pyplot as plt >>> np.random.seed(123) @@ -370,7 +370,7 @@ both the treatment and the outcome regressions, in the case of continuous treatm ... model_Y=WeightedLasso(alpha=0.01), ... model_T=WeightedLasso(alpha=0.01)) >>> est.fit(Y, T, X=X, W=W) - + >>> X_test = np.linspace(-1, 1, 30).reshape(-1, 1) >>> treatment_effects = est.effect(X_test) >>> plt.plot(X_test[:, 0], treatment_effects, label='ORF estimate') diff --git a/doc/spec/estimation/metalearners.rst b/doc/spec/estimation/metalearners.rst index e3cf21c3..34f4bc67 100644 --- a/doc/spec/estimation/metalearners.rst +++ b/doc/spec/estimation/metalearners.rst @@ -135,7 +135,7 @@ See :ref:`Double Machine Learning User Guid `. Class Hierarchy Structure ================================== -.. inheritance-diagram:: econml.metalearners.SLearner econml.metalearners.TLearner econml.metalearners.XLearner econml.metalearners.DomainAdaptationLearner econml.drlearner.DRLearner econml.dml.DML +.. inheritance-diagram:: econml.metalearners.SLearner econml.metalearners.TLearner econml.metalearners.XLearner econml.metalearners.DomainAdaptationLearner econml.dr.DRLearner econml.dml.DML :parts: 1 :private-bases: :top-classes: econml._ortho_learner._OrthoLearner, econml._cate_estimator.LinearCateEstimator, econml._cate_estimator.TreatmentExpansionMixin diff --git a/doc/spec/estimation/two_sls.rst b/doc/spec/estimation/two_sls.rst index fd4e61bc..2a6edd22 100644 --- a/doc/spec/estimation/two_sls.rst +++ b/doc/spec/estimation/two_sls.rst @@ -2,7 +2,7 @@ Sieve 2SLS Instrumental Variable Estimation =========================================== -The sieve based instrumental variable module is based on a two-stage least squares estimation procedure. +The sieve based instrumental variable estimator :class:`.SieveTSLS` is based on a two-stage least squares estimation procedure. The user must specify the sieve basis for :math:`T`, :math:`X` and :math:`Y` (Hermite polynomial or a set of indicator functions), and the number of elements of the basis expansion to include. 
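For orientation, the reorganized package layout referenced throughout this patch can be summarized with the import paths below. This is only a sketch: the paths are taken verbatim from the updated README and doc/reference.rst hunks in this diff, and constructor arguments are deliberately omitted since they are not part of the rename.

```Python
# Sketch of the reorganized import layout (module paths as listed in the updated doc/reference.rst).
# Only import statements are illustrated; estimator constructor arguments are omitted here.
from econml.dml import LinearDML, CausalForestDML      # top-level econml.dml name is unchanged
from econml.dr import LinearDRLearner                  # previously econml.drlearner
from econml.orf import DMLOrthoForest, DROrthoForest   # previously econml.ortho_forest
from econml.iv.dr import LinearIntentToTreatDRIV       # previously econml.ortho_iv
from econml.iv.nnet import DeepIV                      # previously econml.deepiv.DeepIVEstimator
from econml.iv.sieve import SieveTSLS                  # sieve 2SLS estimator referenced in this section
```

The old module names remain importable for now through the deprecation shims shown later in this patch (for example `econml.deepiv` and `econml.bootstrap`), which forward to the new locations and emit deprecation warnings.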
Formally, we now assume that we can write: diff --git a/doc/spec/inference.rst b/doc/spec/inference.rst index acdf50ed..4c597a41 100644 --- a/doc/spec/inference.rst +++ b/doc/spec/inference.rst @@ -60,7 +60,7 @@ This for instance holds for the :class:`.LinearDML` and the :class:`.LinearDRLea .. testcode:: - from econml.drlearner import LinearDRLearner + from econml.dr import LinearDRLearner from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier est = LinearDRLearner(model_regression=RandomForestRegressor(n_estimators=10, min_samples_leaf=10), model_propensity=RandomForestClassifier(n_estimators=10, min_samples_leaf=10)) @@ -92,7 +92,7 @@ explicitly setting ``inference='debiasedlasso'``, e.g.: .. testcode:: - from econml.drlearner import SparseLinearDRLearner + from econml.dr import SparseLinearDRLearner from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier est = SparseLinearDRLearner(model_regression=RandomForestRegressor(n_estimators=10, min_samples_leaf=10), model_propensity=RandomForestClassifier(n_estimators=10, min_samples_leaf=10)) @@ -126,7 +126,7 @@ or by explicitly setting ``inference='blb'``, e.g.: .. testcode:: - from econml.drlearner import ForestDRLearner + from econml.dr import ForestDRLearner from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier est = ForestDRLearner(model_regression=RandomForestRegressor(n_estimators=10, min_samples_leaf=10), model_propensity=RandomForestClassifier(n_estimators=10, min_samples_leaf=10)) @@ -134,7 +134,7 @@ or by explicitly setting ``inference='blb'``, e.g.: point = est.effect(X) lb, ub = est.effect_interval(X, alpha=0.05) -This inference is enabled by our implementation of the :class:`.SubsampledHonestForest` extension to the scikit-learn +This inference is enabled by our implementation of the :class:`~econml.grf.RegressionForest` extension to the scikit-learn :class:`~sklearn.ensemble.RandomForestRegressor`. @@ -148,7 +148,7 @@ inference at its default setting of ``'auto'`` or by explicitly setting ``infere .. testcode:: - from econml.ortho_forest import DMLOrthoForest + from econml.orf import DMLOrthoForest from econml.sklearn_extensions.linear_model import WeightedLasso est = DMLOrthoForest(n_trees=10, min_leaf_size=3, diff --git a/econml/__init__.py b/econml/__init__.py index 1d52f074..deadc2b9 100644 --- a/econml/__init__.py +++ b/econml/__init__.py @@ -1,7 +1,10 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. __all__ = ['automated_ml', 'bootstrap', 'cate_interpreter', 'causal_forest', - 'data', 'deepiv', 'dml', 'drlearner', 'inference', - 'metalearners', 'ortho_forest', 'ortho_iv', + 'data', 'deepiv', 'dml', 'dr', 'drlearner', + 'inference', 'iv', + 'metalearners', 'ortho_forest', 'orf', 'ortho_iv', 'score', 'sklearn_extensions', 'tree', 'two_stage_least_squares', 'utilities'] diff --git a/econml/automated_ml/__init__.py b/econml/automated_ml/__init__.py new file mode 100644 index 00000000..9914c36e --- /dev/null +++ b/econml/automated_ml/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +from ._automated_ml import (setAutomatedMLWorkspace, addAutomatedML, + AutomatedMLModel, AutomatedMLMixin, EconAutoMLConfig) + +__all__ = ["setAutomatedMLWorkspace", + "addAutomatedML", + "AutomatedMLModel", + "AutomatedMLMixin", + "EconAutoMLConfig"] diff --git a/econml/automated_ml.py b/econml/automated_ml/_automated_ml.py similarity index 99% rename from econml/automated_ml.py rename to econml/automated_ml/_automated_ml.py index 600fcb20..629f3d67 100644 --- a/econml/automated_ml.py +++ b/econml/automated_ml/_automated_ml.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + # AzureML from azureml.core.experiment import Experiment from azureml.core import Workspace diff --git a/econml/bootstrap.py b/econml/bootstrap.py index 711b475c..84b1a139 100644 --- a/econml/bootstrap.py +++ b/econml/bootstrap.py @@ -1,281 +1,17 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - -"""Bootstrap sampling.""" -import numpy as np -from joblib import Parallel, delayed -from sklearn.base import clone -from scipy.stats import norm -from collections import OrderedDict -import pandas as pd - - -class BootstrapEstimator: - """Estimator that uses bootstrap sampling to wrap an existing estimator. - - This estimator provides a `fit` method with the same signature as the wrapped estimator. - - The bootstrap estimator will also wrap all other methods and attributes of the wrapped estimator, - but return the average of the sampled calculations (this will fail for non-numeric outputs). - - It will also provide a wrapper method suffixed with `_interval` for each method or attribute of - the wrapped estimator that takes two additional optional keyword arguments `lower` and `upper` specifiying - the percentiles of the interval, and which uses `np.percentile` to return the corresponding lower - and upper bounds based on the sampled calculations. For example, if the underlying estimator supports - an `effect` method with signature `(X,T) -> Y`, this class will provide a method `effect_interval` - with pseudo-signature `(lower=5, upper=95, X, T) -> (Y, Y)` (where `lower` and `upper` cannot be - supplied as positional arguments). - - Parameters - ---------- - wrapped : object - The basis for the clones used for estimation. - This object must support a `fit` method which takes numpy arrays with consistent first dimensions - as arguments. - - n_bootstrap_samples : int - How many draws to perform. - - n_jobs: int, default: None - The maximum number of concurrently running jobs, as in joblib.Parallel. - - compute_means : bool, default: True - Whether to pass calls through to the underlying collection and return the mean. Setting this - to ``False`` can avoid ambiguities if the wrapped object itself has method names with an `_interval` suffix. - - bootstrap_type: 'percentile', 'pivot', or 'normal', default 'pivot' - Bootstrap method used to compute results. 'percentile' will result in using the empiracal CDF of - the replicated computations of the statistics. 'pivot' will also use the replicates but create a pivot - interval that also relies on the estimate over the entire dataset. 'normal' will instead compute an interval - assuming the replicates are normally distributed. 
- """ - - def __init__(self, wrapped, n_bootstrap_samples=1000, n_jobs=None, compute_means=True, bootstrap_type='pivot'): - self._instances = [clone(wrapped, safe=False) for _ in range(n_bootstrap_samples)] - self._n_bootstrap_samples = n_bootstrap_samples - self._n_jobs = n_jobs - self._compute_means = compute_means - self._bootstrap_type = bootstrap_type - self._wrapped = wrapped - - # TODO: Add a __dir__ implementation? - - @staticmethod - def __stratified_indices(arr): - assert 1 <= np.ndim(arr) <= 2 - unique = np.unique(arr, axis=0) - indices = [] - for el in unique: - ind, = np.where(np.all(arr == el, axis=1) if np.ndim(arr) == 2 else arr == el) - indices.append(ind) - return indices - - def fit(self, *args, **named_args): - """ - Fit the model. - - The full signature of this method is the same as that of the wrapped object's `fit` method. - """ - from ._cate_estimator import BaseCateEstimator # need to nest this here to avoid circular import - - index_chunks = None - if isinstance(self._instances[0], BaseCateEstimator): - index_chunks = self._instances[0]._strata(*args, **named_args) - if index_chunks is not None: - index_chunks = self.__stratified_indices(index_chunks) - if index_chunks is None: - n_samples = np.shape(args[0] if args else named_args[(*named_args,)[0]])[0] - index_chunks = [np.arange(n_samples)] # one chunk with all indices - - indices = [] - for chunk in index_chunks: - n_samples = len(chunk) - indices.append(chunk[np.random.choice(n_samples, - size=(self._n_bootstrap_samples, n_samples), - replace=True)]) - - indices = np.hstack(indices) - - def fit(x, *args, **kwargs): - x.fit(*args, **kwargs) - return x # Explicitly return x in case fit fails to return its target - - def convertArg(arg, inds): - if arg is None: - return None - arr = np.asarray(arg) - if arr.ndim > 0: - return arr[inds] - else: # arg was a scalar, so we shouldn't have converted it - return arg - - self._instances = Parallel(n_jobs=self._n_jobs, prefer='threads', verbose=3)( - delayed(fit)(obj, - *[convertArg(arg, inds) for arg in args], - **{arg: convertArg(named_args[arg], inds) for arg in named_args}) - for obj, inds in zip(self._instances, indices) - ) - return self - - def __getattr__(self, name): - """ - Get proxy attribute that wraps the corresponding attribute with the same name from the wrapped object. - - Additionally, the suffix "_interval" is supported for getting an interval instead of a point estimate. 
- """ - - # don't proxy special methods - if name.startswith('__'): - raise AttributeError(name) - - def proxy(make_call, name, summary): - def summarize_with(f): - results = np.array(Parallel(n_jobs=self._n_jobs, prefer='threads', verbose=3)( - (f, (obj, name), {}) for obj in self._instances)), f(self._wrapped, name) - return summary(*results) - if make_call: - def call(*args, **kwargs): - return summarize_with(lambda obj, name: getattr(obj, name)(*args, **kwargs)) - return call - else: - return summarize_with(lambda obj, name: getattr(obj, name)) - - def get_mean(): - # for attributes that exist on the wrapped object, just compute the mean of the wrapped calls - return proxy(callable(getattr(self._instances[0], name)), name, lambda arr, _: np.mean(arr, axis=0)) - - def get_std(): - prefix = name[: - len('_std')] - return proxy(callable(getattr(self._instances[0], prefix)), prefix, - lambda arr, _: np.std(arr, axis=0)) - - def get_interval(): - # if the attribute exists on the wrapped object once we remove the suffix, - # then we should be computing a confidence interval for the wrapped calls - prefix = name[: - len("_interval")] - - def call_with_bounds(can_call, lower, upper): - def percentile_bootstrap(arr, _): - return np.percentile(arr, lower, axis=0), np.percentile(arr, upper, axis=0) - - def pivot_bootstrap(arr, est): - return 2 * est - np.percentile(arr, upper, axis=0), 2 * est - np.percentile(arr, lower, axis=0) - - def normal_bootstrap(arr, est): - std = np.std(arr, axis=0) - return est - norm.ppf(upper / 100) * std, est - norm.ppf(lower / 100) * std - - # TODO: studentized bootstrap? this would be more accurate in most cases but can we avoid - # second level bootstrap which would be prohibitive computationally? - - fn = {'percentile': percentile_bootstrap, - 'normal': normal_bootstrap, - 'pivot': pivot_bootstrap}[self._bootstrap_type] - return proxy(can_call, prefix, fn) - - can_call = callable(getattr(self._instances[0], prefix)) - if can_call: - # collect extra arguments and pass them through, if the wrapped attribute was callable - def call(*args, lower=5, upper=95, **kwargs): - return call_with_bounds(can_call, lower, upper)(*args, **kwargs) - return call - else: - # don't pass extra arguments if the wrapped attribute wasn't callable to begin with - def call(lower=5, upper=95): - return call_with_bounds(can_call, lower, upper) - return call - - def get_inference(): - # can't import from econml.inference at top level without creating cyclical dependencies - from .inference import EmpiricalInferenceResults, NormalInferenceResults - from ._cate_estimator import LinearModelFinalCateEstimatorDiscreteMixin - - prefix = name[: - len("_inference")] - - def fname_transformer(x): - return x - - if prefix in ['const_marginal_effect', 'marginal_effect', 'effect']: - inf_type = 'effect' - elif prefix == 'coef_': - inf_type = 'coefficient' - if (hasattr(self._instances[0], 'cate_feature_names') and - callable(self._instances[0].cate_feature_names)): - def fname_transformer(x): - return self._instances[0].cate_feature_names(x) - elif prefix == 'intercept_': - inf_type = 'intercept' - else: - raise AttributeError("Unsupported inference: " + name) - - d_t = self._wrapped._d_t[0] if self._wrapped._d_t else 1 - if prefix == 'effect' or (isinstance(self._wrapped, LinearModelFinalCateEstimatorDiscreteMixin) and - (inf_type == 'coefficient' or inf_type == 'intercept')): - d_t = 1 - d_y = self._wrapped._d_y[0] if self._wrapped._d_y else 1 - - can_call = callable(getattr(self._instances[0], 
prefix)) - - kind = self._bootstrap_type - if kind == 'percentile' or kind == 'pivot': - def get_dist(est, arr): - if kind == 'percentile': - return arr - elif kind == 'pivot': - return 2 * est - arr - else: - raise ValueError("Invalid kind, must be either 'percentile' or 'pivot'") - - def get_result(): - return proxy(can_call, prefix, - lambda arr, est: EmpiricalInferenceResults( - d_t=d_t, d_y=d_y, - pred=est, pred_dist=get_dist(est, arr), - inf_type=inf_type, - fname_transformer=fname_transformer, - **self._wrapped._input_names if hasattr(self._wrapped, "_input_names") else None)) - - # Note that inference results are always methods even if the inference is for a property - # (e.g. coef__inference() is a method but coef_ is a property) - # Therefore we must insert a lambda if getting inference for a non-callable - return get_result() if can_call else get_result - - else: - assert kind == 'normal' - - def normal_inference(*args, **kwargs): - pred = getattr(self._wrapped, prefix) - if can_call: - pred = pred(*args, **kwargs) - stderr = getattr(self, prefix + '_std') - if can_call: - stderr = stderr(*args, **kwargs) - return NormalInferenceResults( - d_t=d_t, d_y=d_y, pred=pred, - pred_stderr=stderr, inf_type=inf_type, - fname_transformer=fname_transformer, - **self._wrapped._input_names if hasattr(self._wrapped, "_input_names") else None) - - # If inference is for a property, create a fresh lambda to avoid passing args through - return normal_inference if can_call else lambda: normal_inference() - - caught = None - m = None - if name.endswith("_interval"): - m = get_interval - elif name.endswith("_std"): - m = get_std - elif name.endswith("_inference"): - m = get_inference - - # try to get interval/std first if appropriate, - # since we don't prefer a wrapped method with this name - if m is not None: - try: - return m() - except AttributeError as err: - caught = err - if self._compute_means: - return get_mean() - - raise (caught if caught else AttributeError(name)) +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import econml.inference._bootstrap as bootstrap +from .utilities import deprecated + + +@deprecated("The econml.bootstrap.BootstrapEstimator class has been moved " + "to econml.inference._bootstrap.BootstrapEstimator and is no longer part of the public API; " + "an upcoming release will remove support for the old name and will consider `BootstrapEstimator` " + "as part of the private API with no guarantee of API consistency across releases. " + "Instead of wrapping CATE esitmators with the BootstrapEstimator to preduce bootstrap confidence " + "intervals, consider passing `inference='bootstrap'` or " + "`inference=econml.inference.BootstrapInference(n_bootstrap_samples=..., bootstrap_type=...)`, " + "as a keyword argument at the `fit` method of the CATE estimator.") +class BootstrapEstimator(bootstrap.BootstrapEstimator): + pass diff --git a/econml/causal_forest.py b/econml/causal_forest.py index ecbd9e4d..7a3b36e6 100644 --- a/econml/causal_forest.py +++ b/econml/causal_forest.py @@ -1,10 +1,12 @@ -from .ortho_forest import DMLOrthoForest +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ from .utilities import LassoCVWrapper, deprecated from sklearn.linear_model import LogisticRegressionCV from .dml import CausalForestDML -@deprecated("The CausalForest class has been deprecated by the CausalForestDML; " +@deprecated("The CausalForest class has been deprecated by the econml.dml.CausalForestDML; " "an upcoming release will remove support for the old class") def CausalForest(n_trees=500, min_leaf_size=10, diff --git a/econml/deepiv.py b/econml/deepiv.py index 49d3332e..e7a2bbdd 100644 --- a/econml/deepiv.py +++ b/econml/deepiv.py @@ -1,461 +1,17 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - -"""Deep IV estimator and related components.""" - -import numpy as np -import keras -from ._cate_estimator import BaseCateEstimator -from .utilities import deprecated -from keras import backend as K -import keras.layers as L -from keras.models import Model -from econml.utilities import check_input_arrays, _deprecate_positional - -# TODO: make sure to use random seeds wherever necessary -# TODO: make sure that the public API consistently uses "T" instead of "P" for the treatment - -# unfortunately with the Theano and Tensorflow backends, -# the straightforward use of K.stop_gradient can cause an error -# because the parameters of the intermediate layers are now disconnected from the loss; -# therefore we add a pointless multiplication by 0 to the values in each of the variables in vs -# so that those layers remain connected but with 0 gradient - - -def _zero_grad(e, vs): - if K.backend() == 'cntk': - return K.stop_gradient(e) - else: - z = 0 * K.sum(K.concatenate([K.batch_flatten(v) for v in vs])) - return K.stop_gradient(e) + z - - -def mog_model(n_components, d_x, d_t): - """ - Create a mixture of Gaussians model with the specified number of components. - - Parameters - ---------- - n_components : int - The number of components in the mixture model - - d_x : int - The number of dimensions in the layer used as input - - d_t : int - The number of dimensions in the output - - Returns - ------- - A Keras model that takes an input of dimension `d_t` and generates three outputs: pi, mu, and sigma - - """ - x = L.Input((d_x,)) - pi = L.Dense(n_components, activation='softmax')(x) - mu = L.Reshape((n_components, d_t))(L.Dense(n_components * d_t)(x)) - log_sig = L.Dense(n_components)(x) - sig = L.Lambda(K.exp)(log_sig) - return Model([x], [pi, mu, sig]) - - -def mog_loss_model(n_components, d_t): - """ - Create a Keras model that computes the loss of a mixture of Gaussians model on data. - - Parameters - ---------- - n_components : int - The number of components in the mixture model - - d_t : int - The number of dimensions in the output - - Returns - ------- - A Keras model that takes as inputs pi, mu, sigma, and t and generates a single output containing the loss. - - """ - pi = L.Input((n_components,)) - mu = L.Input((n_components, d_t)) - sig = L.Input((n_components,)) - t = L.Input((d_t,)) - - # || t - mu_i || ^2 - d2 = L.Lambda(lambda d: K.sum(K.square(d), axis=-1), - output_shape=(n_components,))( - L.Subtract()([L.RepeatVector(n_components)(t), mu]) - ) - - # LL = C - log(sum(pi_i/sig^d * exp(-d2/(2*sig^2)))) - # Use logsumexp for numeric stability: - # LL = C - log(sum(exp(-d2/(2*sig^2) + log(pi_i/sig^d)))) - # TODO: does the numeric stability actually make any difference? 
- def make_logloss(d2, sig, pi): - return -K.logsumexp(-d2 / (2 * K.square(sig)) + K.log(pi / K.pow(sig, d_t)), axis=-1) - - ll = L.Lambda(lambda dsp: make_logloss(*dsp), output_shape=(1,))([d2, sig, pi]) - - m = Model([pi, mu, sig, t], [ll]) - return m - - -def mog_sample_model(n_components, d_t): - """ - Create a model that generates samples from a mixture of Gaussians. - - Parameters - ---------- - n_components : int - The number of components in the mixture model - - d_t : int - The number of dimensions in the output - - Returns - ------- - A Keras model that takes as inputs pi, mu, and sigma, and generates a single output containing a sample. - - """ - pi = L.Input((n_components,)) - mu = L.Input((n_components, d_t)) - sig = L.Input((n_components,)) - - # CNTK backend can't randomize across batches and doesn't implement cumsum (at least as of June 2018, - # see Known Issues on https://docs.microsoft.com/en-us/cognitive-toolkit/Using-CNTK-with-Keras) - def sample(pi, mu, sig): - batch_size = K.shape(pi)[0] - if K.backend() == 'cntk': - # generate cumulative sum via matrix multiplication - cumsum = K.dot(pi, K.constant(np.triu(np.ones((n_components, n_components))))) - else: - cumsum = K.cumsum(pi, 1) - cumsum_shift = K.concatenate([K.zeros_like(cumsum[:, 0:1]), cumsum])[:, :-1] - if K.backend() == 'cntk': - import cntk as C - # Generate standard uniform values in shape (batch_size,1) - # (since we can't use the dynamic batch_size with random.uniform in CNTK, - # we use uniform_like instead with an input of an appropriate shape) - rndSmp = C.random.uniform_like(pi[:, 0:1]) - else: - rndSmp = K.random_uniform((batch_size, 1)) - cmp1 = K.less_equal(cumsum_shift, rndSmp) - cmp2 = K.less(rndSmp, cumsum) - - # convert to floats and multiply to perform equivalent of logical AND - rndIndex = K.cast(cmp1, K.floatx()) * K.cast(cmp2, K.floatx()) - - if K.backend() == 'cntk': - # Generate standard normal values in shape (batch_size,1,d_t) - # (since we can't use the dynamic batch_size with random.normal in CNTK, - # we use normal_like instead with an input of an appropriate shape) - rndNorms = C.random.normal_like(mu[:, 0:1, :]) # K.random_normal((1,d_t)) - else: - rndNorms = K.random_normal((batch_size, 1, d_t)) - - rndVec = mu + K.expand_dims(sig) * rndNorms - - # exactly one entry should be nonzero for each b,d combination; use sum to select it - return K.sum(K.expand_dims(rndIndex) * rndVec, 1) - - # prevent gradient from passing through sampling - samp = L.Lambda(lambda pms: _zero_grad(sample(*pms), pms), output_shape=(d_t,)) - samp.trainable = False - - return Model([pi, mu, sig], samp([pi, mu, sig])) - - -# three options: biased or upper-bound loss require a single number of samples; -# unbiased can take different numbers for the network and its gradient -def response_loss_model(h, p, d_z, d_x, d_y, samples=1, use_upper_bound=False, gradient_samples=0): - """ - Create a Keras model that computes the loss of a response model on data. 
- - Parameters - ---------- - h : (tensor, tensor) -> Layer - Method for building a model of y given p and x - - p : (tensor, tensor) -> Layer - Method for building a model of p given z and x - - d_z : int - The number of dimensions in z - - d_x : int - Tbe number of dimensions in x - - d_y : int - The number of dimensions in y - - samples: int - The number of samples to use - - use_upper_bound : bool - Whether to use an upper bound to the true loss - (equivalent to adding a regularization penalty on the variance of h) - - gradient_samples : int - The number of separate additional samples to use when calculating the gradient. - This can only be nonzero if user_upper_bound is False, in which case the gradient of - the returned loss will be an unbiased estimate of the gradient of the true loss. - - Returns - ------- - A Keras model that takes as inputs z, x, and y and generates a single output containing the loss. - - """ - assert not(use_upper_bound and gradient_samples) - - # sample: (() -> Layer, int) -> Layer - def sample(f, n): - assert n > 0 - if n == 1: - return f() - else: - return L.average([f() for _ in range(n)]) - z, x, y = [L.Input((d,)) for d in [d_z, d_x, d_y]] - if gradient_samples: - # we want to separately sample the gradient; we use stop_gradient to treat the sampled model as constant - # the overall computation ensures that we have an interpretable loss (y-h̅(p,x))², - # but also that the gradient is -2(y-h̅(p,x))∇h̅(p,x) with *different* samples used for each average - diff = L.subtract([y, sample(lambda: h(p(z, x), x), samples)]) - grad = sample(lambda: h(p(z, x), x), gradient_samples) - - def make_expr(grad, diff): - return K.stop_gradient(diff) * (K.stop_gradient(diff + 2 * grad) - 2 * grad) - expr = L.Lambda(lambda args: make_expr(*args))([grad, diff]) - elif use_upper_bound: - expr = sample(lambda: L.Lambda(K.square)(L.subtract([y, h(p(z, x), x)])), samples) - else: - expr = L.Lambda(K.square)(L.subtract([y, sample(lambda: h(p(z, x), x), samples)])) - return Model([z, x, y], [expr]) - - -class DeepIV(BaseCateEstimator): - """ - The Deep IV Estimator (see http://proceedings.mlr.press/v70/hartford17a/hartford17a.pdf). - - Parameters - ---------- - n_components : int - Number of components in the mixture density network - - m : (tensor, tensor) -> Layer - Method for building a Keras model that featurizes the z and x inputs - - h : (tensor, tensor) -> Layer - Method for building a model of y given t and x - - n_samples : int - The number of samples to use - - use_upper_bound_loss : bool, optional - Whether to use an upper bound to the true loss - (equivalent to adding a regularization penalty on the variance of h). - Defaults to False. - - n_gradient_samples : int, optional - The number of separate additional samples to use when calculating the gradient. - This can only be nonzero if user_upper_bound is False, in which case the gradient of - the returned loss will be an unbiased estimate of the gradient of the true loss. - Defaults to 0. - - optimizer : string, optional - The optimizer to use. Defaults to "adam" - - first_stage_options : dictionary, optional - The keyword arguments to pass to Keras's `fit` method when training the first stage model. - Defaults to `{"epochs": 100}`. - - second_stage_options : dictionary, optional - The keyword arguments to pass to Keras's `fit` method when training the second stage model. - Defaults to `{"epochs": 100}`. 
- - """ - - def __init__(self, *, - n_components, - m, - h, - n_samples, use_upper_bound_loss=False, n_gradient_samples=0, - optimizer='adam', - first_stage_options={"epochs": 100}, - second_stage_options={"epochs": 100}): - self._n_components = n_components - self._m = m - self._h = h - self._n_samples = n_samples - self._use_upper_bound_loss = use_upper_bound_loss - self._n_gradient_samples = n_gradient_samples - self._optimizer = optimizer - self._first_stage_options = first_stage_options - self._second_stage_options = second_stage_options - super().__init__() - - @_deprecate_positional("X and Z should be passed by keyword only. In a future release " - "we will disallow passing X and Z by position.", ['X', 'Z']) - @BaseCateEstimator._wrap_fit - def fit(self, Y, T, X, Z, *, inference=None): - """Estimate the counterfactual model from data. - - That is, estimate functions τ(·, ·, ·), ∂τ(·, ·). - - Parameters - ---------- - Y: (n × d_y) matrix or vector of length n - Outcomes for each sample - T: (n × dₜ) matrix or vector of length n - Treatments for each sample - X: (n × dₓ) matrix - Features for each sample - Z: (n × d_z) matrix - Instruments for each sample - inference: string, :class:`.Inference` instance, or None - Method for performing inference. This estimator supports 'bootstrap' - (or an instance of :class:`.BootstrapInference`) - - Returns - ------- - self - - """ - Y, T, X, Z = check_input_arrays(Y, T, X, Z) - assert 1 <= np.ndim(X) <= 2 - assert 1 <= np.ndim(Z) <= 2 - assert 1 <= np.ndim(T) <= 2 - assert 1 <= np.ndim(Y) <= 2 - assert np.shape(X)[0] == np.shape(Y)[0] == np.shape(T)[0] == np.shape(Z)[0] - - # in case vectors were passed for Y or T, keep track of trailing dims for reshaping effect output - - d_x, d_y, d_z, d_t = [np.shape(a)[1] if np.ndim(a) > 1 else 1 for a in [X, Y, Z, T]] - x_in, y_in, z_in, t_in = [L.Input((d,)) for d in [d_x, d_y, d_z, d_t]] - n_components = self._n_components - - treatment_network = self._m(z_in, x_in) - - # the dimensionality of the output of the network - # TODO: is there a more robust way to do this? - d_n = K.int_shape(treatment_network)[-1] - - pi, mu, sig = mog_model(n_components, d_n, d_t)([treatment_network]) - - ll = mog_loss_model(n_components, d_t)([pi, mu, sig, t_in]) - - model = Model([z_in, x_in, t_in], [ll]) - model.add_loss(L.Lambda(K.mean)(ll)) - model.compile(self._optimizer) - # TODO: do we need to give the user more control over other arguments to fit? - model.fit([Z, X, T], [], **self._first_stage_options) - - lm = response_loss_model(lambda t, x: self._h(t, x), - lambda z, x: Model([z_in, x_in], - # subtle point: we need to build a new model each time, - # because each model encapsulates its randomness - [mog_sample_model(n_components, d_t)([pi, mu, sig])])([z, x]), - d_z, d_x, d_y, - self._n_samples, self._use_upper_bound_loss, self._n_gradient_samples) - - rl = lm([z_in, x_in, y_in]) - response_model = Model([z_in, x_in, y_in], [rl]) - response_model.add_loss(L.Lambda(K.mean)(rl)) - response_model.compile(self._optimizer) - # TODO: do we need to give the user more control over other arguments to fit? 
- response_model.fit([Z, X, Y], [], **self._second_stage_options) - - self._effect_model = Model([t_in, x_in], [self._h(t_in, x_in)]) - - # TODO: it seems like we need to sum over the batch because we can only apply gradient to a scalar, - # not a general tensor (because of how backprop works in every framework) - # (alternatively, we could iterate through the batch in addition to iterating through the output, - # but this seems annoying...) - # Therefore, it's important that we use a batch size of 1 when we call predict with this model - def calc_grad(t, x): - h = self._h(t, x) - all_grads = K.concatenate([g - for i in range(d_y) - for g in K.gradients(K.sum(h[:, i]), [t])]) - return K.reshape(all_grads, (-1, d_y, d_t)) - - self._marginal_effect_model = Model([t_in, x_in], L.Lambda(lambda tx: calc_grad(*tx))([t_in, x_in])) - - def effect(self, X=None, T0=0, T1=1): - """ - Calculate the heterogeneous treatment effect τ(·,·,·). - - The effect is calculated between the two treatment points - conditional on a vector of features on a set of m test samples {T0ᵢ, T1ᵢ, Xᵢ}. - - Parameters - ---------- - T0: (m × dₜ) matrix - Base treatments for each sample - T1: (m × dₜ) matrix - Target treatments for each sample - X: optional (m × dₓ) matrix - Features for each sample - - Returns - ------- - τ: (m × d_y) matrix - Heterogeneous treatment effects on each outcome for each sample - Note that when Y is a vector rather than a 2-dimensional array, the corresponding - singleton dimension will be collapsed (so this method will return a vector) - """ - X, T0, T1 = check_input_arrays(X, T0, T1) - if np.ndim(T0) == 0: - T0 = np.repeat(T0, 1 if X is None else np.shape(X)[0]) - if np.ndim(T1) == 0: - T1 = np.repeat(T1, 1 if X is None else np.shape(X)[0]) - if X is None: - X = np.empty((np.shape(T0)[0], 0)) - return (self._effect_model.predict([T1, X]) - self._effect_model.predict([T0, X])).reshape((-1,) + self._d_y) - - def marginal_effect(self, T, X=None): - """ - Calculate the marginal effect ∂τ(·, ·) around a base treatment point conditional on features. - - Parameters - ---------- - T: (m × dₜ) matrix - Base treatments for each sample - X: optional(m × dₓ) matrix - Features for each sample - - Returns - ------- - grad_tau: (m × d_y × dₜ) array - Heterogeneous marginal effects on each outcome for each sample - Note that when Y or T is a vector rather than a 2-dimensional array, - the corresponding singleton dimensions in the output will be collapsed - (e.g. if both are vectors, then the output of this method will also be a vector) - """ - T, X = check_input_arrays(T, X) - # TODO: any way to get this to work on batches of arbitrary size? - return self._marginal_effect_model.predict([T, X], batch_size=1).reshape((-1,) + self._d_y + self._d_t) - - def predict(self, T, X): - """Predict outcomes given treatment assignments and features. 
- - Parameters - ---------- - T: (m × dₜ) matrix - Base treatments for each sample - X: (m × dₓ) matrix - Features for each sample - - Returns - ------- - Y: (m × d_y) matrix - Outcomes for each sample - Note that when Y is a vector rather than a 2-dimensional array, the corresponding - singleton dimension will be collapsed (so this method will return a vector) - """ - T, X = check_input_arrays(T, X) - return self._effect_model.predict([T, X]).reshape((-1,) + self._d_y) - - -@deprecated("The DeepIVEstimator class has been renamed to DeepIV; " - "an upcoming release will remove support for the old name") -class DeepIVEstimator(DeepIV): - pass +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import econml.iv.nnet as nnet +from .utilities import deprecated + + +@deprecated("The econml.deepiv.DeepIV class has renamed to econml.iv.nnet.DeepIV; " + "an upcoming release will remove support for the old name") +class DeepIV(nnet.DeepIV): + pass + + +@deprecated("The econml.deepiv.DeepIVEstimator class has been renamed to econml.iv.nnet.DeepIV; " + "an upcoming release will remove support for the old name") +class DeepIVEstimator(nnet.DeepIV): + pass diff --git a/econml/dml/causal_forest.py b/econml/dml/causal_forest.py index d5965626..c19e127e 100644 --- a/econml/dml/causal_forest.py +++ b/econml/dml/causal_forest.py @@ -8,7 +8,8 @@ from .dml import _BaseDML from .dml import _FirstStageWrapper, _FinalWrapper from ..sklearn_extensions.linear_model import WeightedLassoCVWrapper from ..sklearn_extensions.model_selection import WeightedStratifiedKFold -from ..inference import Inference, NormalInferenceResults +from ..inference import NormalInferenceResults +from ..inference._inference import Inference from sklearn.linear_model import LogisticRegressionCV from sklearn.base import clone, BaseEstimator from sklearn.preprocessing import FunctionTransformer @@ -100,7 +101,7 @@ class _GenericSingleOutcomeModelFinalWithCovInference(Inference): class CausalForestDML(_BaseDML): - """A Causal Forest [1]_ combined with double machine learning based residualization of the treatment + """A Causal Forest [cfdml1]_ combined with double machine learning based residualization of the treatment and outcome variables. It fits a forest that solves the local moment equation problem: .. code-block:: @@ -203,7 +204,7 @@ class CausalForestDML(_BaseDML): weight(left) * weight(right) || theta(left) - theta(right)||_2^2 / weight(parent)^2 - as outlined in [1]_ + as outlined in [cfdml1]_ max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until @@ -370,7 +371,7 @@ class CausalForestDML(_BaseDML): References ---------- - .. [1] Athey, Susan, Julie Tibshirani, and Stefan Wager. "Generalized random forests." + .. [cfdml1] Athey, Susan, Julie Tibshirani, and Stefan Wager. "Generalized random forests." 
The Annals of Statistics 47.2 (2019): 1148-1178 https://arxiv.org/pdf/1610.01271.pdf diff --git a/econml/dml/dml.py b/econml/dml/dml.py index ef7ca18d..b3d1c16d 100644 --- a/econml/dml/dml.py +++ b/econml/dml/dml.py @@ -23,7 +23,6 @@ from .._cate_estimator import (DebiasedLassoCateEstimatorMixin, StatsModelsCateEstimatorMixin, LinearCateEstimator) from ..inference import StatsModelsInference, GenericSingleTreatmentModelFinalInference -from ..sklearn_extensions.ensemble import SubsampledHonestForest from ..sklearn_extensions.linear_model import (MultiOutputDebiasedLasso, StatsModelsLinearRegression, WeightedLassoCVWrapper) @@ -1217,7 +1216,7 @@ def ForestDML(model_y, model_t, verbose=0, random_state=None): """ Instance of NonParamDML with a - :class:`~econml.sklearn_extensions.ensemble.SubsampledHonestForest` + :class:`~econml.grf.RegressionForest` as a final model, so as to enable non-parametric inference. Parameters diff --git a/econml/dr/__init__.py b/econml/dr/__init__.py new file mode 100644 index 00000000..a69bc9bf --- /dev/null +++ b/econml/dr/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +from ._drlearner import (DRLearner, LinearDRLearner, SparseLinearDRLearner, ForestDRLearner) + +__all__ = ["DRLearner", + "LinearDRLearner", + "SparseLinearDRLearner", + "ForestDRLearner"] diff --git a/econml/dr/_drlearner.py b/econml/dr/_drlearner.py new file mode 100644 index 00000000..ffbdc3fd --- /dev/null +++ b/econml/dr/_drlearner.py @@ -0,0 +1,1552 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +""" +Doubly Robust Learner. The method uses the doubly robust correction to construct doubly +robust estimates of all the potential outcomes of each sample. Then estimates a CATE model +by regressing the potential outcome differences on the heterogeneity features X. + +References +---------- + +Dylan Foster, Vasilis Syrgkanis (2019). + Orthogonal Statistical Learning. + ACM Conference on Learning Theory. https://arxiv.org/abs/1901.09036 + +Robins, J.M., Rotnitzky, A., and Zhao, L.P. (1994). + Estimation of regression coefficients when some regressors are not always observed. + Journal of the American Statistical Association 89,846–866. + +Bang, H. and Robins, J.M. (2005). + Doubly robust estimation in missing data and causal inference models. + Biometrics 61,962–972. + +Tsiatis AA (2006). + Semiparametric Theory and Missing Data. + New York: Springer; 2006. + + ..
testcode:: + :hide: + + import numpy as np + import scipy.special + np.set_printoptions(suppress=True) + +""" + +from warnings import warn +from copy import deepcopy + +import numpy as np +from sklearn.base import clone +from sklearn.linear_model import (LassoCV, LinearRegression, + LogisticRegressionCV) +from sklearn.ensemble import RandomForestRegressor + +from .._ortho_learner import _OrthoLearner +from .._cate_estimator import (DebiasedLassoCateEstimatorDiscreteMixin, + ForestModelFinalCateEstimatorDiscreteMixin, + StatsModelsCateEstimatorDiscreteMixin, LinearCateEstimator) +from ..inference import GenericModelFinalInferenceDiscrete +from ..grf import RegressionForest +from ..sklearn_extensions.linear_model import ( + DebiasedLasso, StatsModelsLinearRegression, WeightedLassoCVWrapper) +from ..utilities import (_deprecate_positional, check_high_dimensional, + filter_none_kwargs, fit_with_groups, inverse_onehot) +from .._shap import _shap_explain_multitask_model_cate, _shap_explain_model_cate + + +class _ModelNuisance: + def __init__(self, model_propensity, model_regression, min_propensity): + self._model_propensity = model_propensity + self._model_regression = model_regression + self._min_propensity = min_propensity + + def _combine(self, X, W): + return np.hstack([arr for arr in [X, W] if arr is not None]) + + def fit(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): + if Y.ndim != 1 and (Y.ndim != 2 or Y.shape[1] != 1): + raise ValueError("The outcome matrix must be of shape ({0}, ) or ({0}, 1), " + "instead got {1}.".format(len(X), Y.shape)) + if (X is None) and (W is None): + raise AttributeError("At least one of X or W has to not be None!") + if np.any(np.all(T == 0, axis=0)) or (not np.any(np.all(T == 0, axis=1))): + raise AttributeError("Provided crossfit folds contain training splits that " + + "don't contain all treatments") + XW = self._combine(X, W) + filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight) + + fit_with_groups(self._model_propensity, XW, inverse_onehot(T), groups=groups, **filtered_kwargs) + fit_with_groups(self._model_regression, np.hstack([XW, T]), Y, groups=groups, **filtered_kwargs) + return self + + def score(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): + XW = self._combine(X, W) + filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight) + + if hasattr(self._model_propensity, 'score'): + propensity_score = self._model_propensity.score(XW, inverse_onehot(T), **filtered_kwargs) + else: + propensity_score = None + if hasattr(self._model_regression, 'score'): + regression_score = self._model_regression.score(np.hstack([XW, T]), Y, **filtered_kwargs) + else: + regression_score = None + + return propensity_score, regression_score + + def predict(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): + XW = self._combine(X, W) + propensities = np.maximum(self._model_propensity.predict_proba(XW), self._min_propensity) + n = T.shape[0] + Y_pred = np.zeros((T.shape[0], T.shape[1] + 1)) + T_counter = np.zeros(T.shape) + Y_pred[:, 0] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n) + Y_pred[:, 0] += (Y.reshape(n) - Y_pred[:, 0]) * np.all(T == 0, axis=1) / propensities[:, 0] + for t in np.arange(T.shape[1]): + T_counter = np.zeros(T.shape) + T_counter[:, t] = 1 + Y_pred[:, t + 1] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n) + Y_pred[:, t + 1] += (Y.reshape(n) - Y_pred[:, t + 1]) * (T[:, t] == 1) / propensities[:, t + 1] + return Y_pred.reshape(Y.shape + 
(T.shape[1] + 1,)) + + +class _ModelFinal: + # Coding Remark: The reasoning around the multitask_model_final could have been simplified if + # we simply wrapped the model_final with a MultiOutputRegressor. However, because we also want + # to allow even for model_final objects whose fit(X, y) can accept X=None + # (e.g. the StatsModelsLinearRegression), we cannot take that route, because the MultiOutputRegressor + # checks that X is 2D array. + def __init__(self, model_final, featurizer, multitask_model_final): + self._model_final = clone(model_final, safe=False) + self._featurizer = clone(featurizer, safe=False) + self._multitask_model_final = multitask_model_final + return + + def fit(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, sample_var=None): + Y_pred, = nuisances + self.d_y = Y_pred.shape[1:-1] # track whether there's a Y dimension (must be a singleton) + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.fit_transform(X) + filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight, sample_var=sample_var) + if self._multitask_model_final: + ys = Y_pred[..., 1:] - Y_pred[..., [0]] # subtract control results from each other arm + if self.d_y: # need to squeeze out singleton so that we fit on 2D array + ys = ys.squeeze(1) + self.model_cate = self._model_final.fit(X, ys, **filtered_kwargs) + else: + self.models_cate = [clone(self._model_final, safe=False).fit(X, Y_pred[..., t] - Y_pred[..., 0], + **filtered_kwargs) + for t in np.arange(1, Y_pred.shape[-1])] + return self + + def predict(self, X=None): + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.transform(X) + if self._multitask_model_final: + pred = self.model_cate.predict(X) + if self.d_y: # need to reintroduce singleton Y dimension + return pred[:, np.newaxis, :] + return pred + else: + preds = np.array([mdl.predict(X).reshape((-1,) + self.d_y) for mdl in self.models_cate]) + return np.moveaxis(preds, 0, -1) # move treatment dim to end + + def score(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, sample_var=None): + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.transform(X) + Y_pred, = nuisances + if self._multitask_model_final: + return np.mean(np.average((Y_pred[..., 1:] - Y_pred[..., [0]] - self.model_cate.predict(X))**2, + weights=sample_weight, axis=0)) + else: + return np.mean([np.average((Y_pred[..., t] - Y_pred[..., 0] - + self.models_cate[t - 1].predict(X))**2, + weights=sample_weight, axis=0) + for t in np.arange(1, Y_pred.shape[-1])]) + + +class DRLearner(_OrthoLearner): + """ + CATE estimator that uses doubly-robust correction techniques to account for + covariate shift (selection bias) between the treatment arms. The estimator is a special + case of an :class:`._OrthoLearner` estimator, so it follows the two + stage process, where a set of nuisance functions are estimated in the first stage in a crossfitting + manner and a final stage estimates the CATE model. See the documentation of + :class:`._OrthoLearner` for a description of this two stage process. + + In this estimator, the CATE is estimated by using the following estimating equations. If we let: + + .. math :: + Y_{i, t}^{DR} = E[Y | X_i, W_i, T_i=t]\ + + \\frac{Y_i - E[Y | X_i, W_i, T_i=t]}{Pr[T_i=t | X_i, W_i]} \\cdot 1\\{T_i=t\\} + + Then the following estimating equation holds: + + .. 
math :: + E\\left[Y_{i, t}^{DR} - Y_{i, 0}^{DR} | X_i\\right] = \\theta_t(X_i) + + Thus if we estimate the nuisance functions :math:`h(X, W, T) = E[Y | X, W, T]` and + :math:`p_t(X, W)=Pr[T=t | X, W]` in the first stage, we can estimate the final stage cate for each + treatment t, by running a regression, regressing :math:`Y_{i, t}^{DR} - Y_{i, 0}^{DR}` on :math:`X_i`. + + The problem of estimating the nuisance function :math:`p` is a simple multi-class classification + problem of predicting the label :math:`T` from :math:`X, W`. The :class:`.DRLearner` + class takes as input the parameter ``model_propensity``, which is an arbitrary scikit-learn + classifier, that is internally used to solve this classification problem. + + The second nuisance function :math:`h` is a simple regression problem and the :class:`.DRLearner` + class takes as input the parameter ``model_regression``, which is an arbitrary scikit-learn regressor that + is internally used to solve this regression problem. + + The final stage is a multi-task regression problem with outcomes the labels :math:`Y_{i, t}^{DR} - Y_{i, 0}^{DR}` + for each non-baseline treatment t. The :class:`.DRLearner` takes as input parameter + ``model_final``, which is any scikit-learn regressor that is internally used to solve this multi-task + regression problem. If the parameter ``multitask_model_final`` is False, then this model is assumed + to be a mono-task regressor, and separate clones of it are used to solve each regression target + separately. + + Parameters + ---------- + model_propensity : scikit-learn classifier or 'auto', optional (default='auto') + Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. + Must implement `fit` and `predict_proba` methods. The `fit` method must be able to accept X and T, + where T is a shape (n, ) array. + If 'auto', :class:`~sklearn.linear_model.LogisticRegressionCV` will be chosen. + + model_regression : scikit-learn regressor or 'auto', optional (default='auto') + Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments) + concatenated. The one-hot-encoding excludes the baseline treatment. Must implement `fit` and + `predict` methods. If different models per treatment arm are desired, see the + :class:`.MultiModelWrapper` helper class. + If 'auto', :class:`.WeightedLassoCV`/:class:`.WeightedMultiTaskLassoCV` will be chosen. + + model_final : + estimator for the final cate model. Trained on regressing the doubly robust potential outcomes + on (features X). + + - If X is None, then the fit method of model_final should be able to handle X=None. + - If featurizer is not None and X is not None, then it is trained on the outcome of + featurizer.fit_transform(X). + - If multitask_model_final is True, then this model must support multitasking + and it is trained by regressing all doubly robust target outcomes on (featurized) features simultaneously. + - The output of the predict(X) of the trained model will contain the CATEs for each treatment compared to + baseline treatment (lexicographically smallest). If multitask_model_final is False, it is assumed to be a + mono-task model and a separate clone of the model is trained for each outcome. Then predict(X) of the t-th + clone will be the CATE of the t-th lexicographically ordered treatment compared to the baseline. + + multitask_model_final : bool, optional, default False + Whether the model_final should be treated as a multi-task model. See description of model_final.
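The estimating equations above can be made concrete with a short numpy sketch of how the doubly robust pseudo-outcomes Y^DR can be assembled from already-fitted first-stage models. The helper name `dr_pseudo_outcomes` and the assumed array layout (a one-hot treatment matrix whose all-zero rows encode the control arm) are hypothetical conveniences for exposition, not part of the library's public API; the estimator performs this step internally during cross-fitting.

```Python
import numpy as np

def dr_pseudo_outcomes(Y, T, XW, model_regression, model_propensity, min_propensity=1e-6):
    """Illustrative sketch: build Y^DR for every arm from fitted first-stage models.

    Y: (n,) outcomes; T: (n, d_t) one-hot treatments with the control arm as the
    all-zero row; XW: (n, d) concatenated features and controls. Both models are
    assumed to have been fitted on a held-out fold, as in cross-fitting.
    """
    n, d_t = T.shape
    # Clip propensities away from zero before dividing.
    propensities = np.maximum(model_propensity.predict_proba(XW), min_propensity)
    Y_dr = np.zeros((n, d_t + 1))
    for t in range(d_t + 1):
        T_counter = np.zeros_like(T)
        if t > 0:
            T_counter[:, t - 1] = 1
        mu_t = model_regression.predict(np.hstack([XW, T_counter]))  # E[Y | X, W, T=t]
        took_t = np.all(T == 0, axis=1) if t == 0 else (T[:, t - 1] == 1)
        Y_dr[:, t] = mu_t + (Y - mu_t) * took_t / propensities[:, t]
    # The final stage regresses Y_dr[:, t] - Y_dr[:, 0] on (featurized) X to learn theta_t(X).
    return Y_dr
```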
+ + featurizer : :term:`transformer`, optional, default None + Must support fit_transform and transform. Used to create composite features in the final CATE regression. + It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). + If featurizer=None, then CATE is trained on X. + + min_propensity : float, optional, default ``1e-6`` + The minimum propensity at which to clip propensity estimates to avoid dividing by zero. + + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + + cv: int, cross-validation generator or an iterable, optional (default is 2) + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter` + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the treatment is discrete + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used + (with a random shuffle in either case). + + Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all + W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + + mc_iters: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + + mc_agg: {'mean', 'median'}, optional (default='mean') + How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of + cross-fitting. + + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None + If int, random_state is the seed used by the random number generator; + If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; + If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used + by :mod:`np.random`. + + Examples + -------- + A simple example with the default models: + + .. testcode:: + :hide: + + import numpy as np + import scipy.special + np.set_printoptions(suppress=True) + + .. testcode:: + + from econml.dr import DRLearner + + np.random.seed(123) + X = np.random.normal(size=(1000, 3)) + T = np.random.binomial(2, scipy.special.expit(X[:, 0])) + sigma = 0.001 + y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(0, sigma, size=(1000,)) + est = DRLearner() + est.fit(y, T, X=X, W=None) + + >>> est.const_marginal_effect(X[:2]) + array([[0.511640..., 1.144004...], + [0.378140..., 0.613143...]]) + >>> est.effect(X[:2], T0=0, T1=1) + array([0.511640..., 0.378140...]) + >>> est.score_ + 5.11238581... + >>> est.score(y, T, X=X) + 5.78673506... + >>> est.model_cate(T=1).coef_ + array([0.434910..., 0.010226..., 0.047913...]) + >>> est.model_cate(T=2).coef_ + array([ 0.863723..., 0.086946..., -0.022288...]) + >>> est.cate_feature_names() + + >>> [mdl.coef_ for mdl in est.models_regression] + [array([ 1.472..., 0.001..., -0.011..., 0.698..., 2.049...]), + array([ 1.455..., -0.002..., 0.005..., 0.677..., 1.998...])] + >>> [mdl.coef_ for mdl in est.models_propensity] + [array([[-0.747..., 0.153..., -0.018...], + [ 0.083..., -0.110..., -0.076...], + [ 0.663..., -0.043... 
, 0.094...]]), + array([[-1.048..., 0.000..., 0.032...], + [ 0.019..., 0.124..., -0.081...], + [ 1.029..., -0.124..., 0.049...]])] + + Beyond default models: + + .. testcode:: + + from sklearn.linear_model import LassoCV + from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + from econml.dr import DRLearner + + np.random.seed(123) + X = np.random.normal(size=(1000, 3)) + T = np.random.binomial(2, scipy.special.expit(X[:, 0])) + sigma = 0.01 + y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(0, sigma, size=(1000,)) + est = DRLearner(model_propensity=RandomForestClassifier(n_estimators=100, min_samples_leaf=10), + model_regression=RandomForestRegressor(n_estimators=100, min_samples_leaf=10), + model_final=LassoCV(cv=3), + featurizer=None) + est.fit(y, T, X=X, W=None) + + >>> est.score_ + 1.7... + >>> est.const_marginal_effect(X[:3]) + array([[0.68..., 1.10...], + [0.56..., 0.79...], + [0.34..., 0.10...]]) + >>> est.model_cate(T=2).coef_ + array([0.74..., 0. , 0. ]) + >>> est.model_cate(T=2).intercept_ + 1.9... + >>> est.model_cate(T=1).coef_ + array([0.24..., 0.00..., 0. ]) + >>> est.model_cate(T=1).intercept_ + 0.94... + + Attributes + ---------- + score_ : float + The MSE in the final doubly robust potential outcome regressions, i.e. + + .. math:: + \\frac{1}{n_t} \\sum_{t=1}^{n_t} \\frac{1}{n} \\sum_{i=1}^n (Y_{i, t}^{DR} - \\hat{\\theta}_t(X_i))^2 + + where n_t is the number of treatments (excluding control). + + If `sample_weight` is not None at fit time, then a weighted average across samples is returned. + + + """ + + def __init__(self, *, + model_propensity='auto', + model_regression='auto', + model_final=StatsModelsLinearRegression(), + multitask_model_final=False, + featurizer=None, + min_propensity=1e-6, + categories='auto', + cv=2, + n_splits='raise', + mc_iters=None, + mc_agg='mean', + random_state=None): + self.model_propensity = clone(model_propensity, safe=False) + self.model_regression = clone(model_regression, safe=False) + self.model_final = clone(model_final, safe=False) + self.multitask_model_final = multitask_model_final + self.featurizer = clone(featurizer, safe=False) + self.min_propensity = min_propensity + super().__init__(cv=cv, + n_splits=n_splits, + mc_iters=mc_iters, + mc_agg=mc_agg, + discrete_treatment=True, + discrete_instrument=False, # no instrument, so doesn't matter + categories=categories, + random_state=random_state) + + def _get_inference_options(self): + options = super()._get_inference_options() + if not self.multitask_model_final: + options.update(auto=GenericModelFinalInferenceDiscrete) + else: + options.update(auto=lambda: None) + return options + + def _gen_ortho_learner_model_nuisance(self): + if self.model_propensity == 'auto': + model_propensity = LogisticRegressionCV(cv=3, solver='lbfgs', multi_class='auto', + random_state=self.random_state) + else: + model_propensity = clone(self.model_propensity, safe=False) + + if self.model_regression == 'auto': + model_regression = WeightedLassoCVWrapper(cv=3, random_state=self.random_state) + else: + model_regression = clone(self.model_regression, safe=False) + + return _ModelNuisance(model_propensity, model_regression, self.min_propensity) + + def _gen_featurizer(self): + return clone(self.featurizer, safe=False) + + def _gen_model_final(self): + return clone(self.model_final, safe=False) + + def _gen_ortho_learner_model_final(self): + return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), self.multitask_model_final) + + @_deprecate_positional("X and W should be 
passed by keyword only. In a future release " + "we will disallow passing X and W by position.", ['X', 'W']) + def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference='auto'): + """ + Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. + + Parameters + ---------- + Y: (n,) vector of length n + Outcomes for each sample + T: (n,) vector of length n + Treatments for each sample + X: optional(n, d_x) matrix or None (Default=None) + Features for each sample + W: optional(n, d_w) matrix or None (Default=None) + Controls for each sample + sample_weight: optional(n,) vector or None (Default=None) + Weights for each samples + sample_var: optional(n,) vector or None (Default=None) + Sample variance for each sample + groups: (n,) vector, optional + All rows corresponding to the same group will be kept together during splitting. + If groups is not None, the `cv` argument passed to this class's initializer + must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model + inference: string, :class:`.Inference` instance, or None + Method for performing inference. This estimator supports 'bootstrap' + (or an instance of :class:`.BootstrapInference`). + + Returns + ------- + self: DRLearner instance + """ + # Replacing fit from _OrthoLearner, to enforce Z=None and improve the docstring + return super().fit(Y, T, X=X, W=W, + sample_weight=sample_weight, sample_var=sample_var, groups=groups, + cache_values=cache_values, inference=inference) + + def refit_final(self, *, inference='auto'): + return super().refit_final(inference=inference) + refit_final.__doc__ = _OrthoLearner.refit_final.__doc__ + + def score(self, Y, T, X=None, W=None): + """ + Score the fitted CATE model on a new data set. Generates nuisance parameters + for the new data set based on the fitted residual nuisance models created at fit time. + It uses the mean prediction of the models fitted by the different crossfit folds. + Then calculates the MSE of the final residual Y on residual T regression. + + If model_final does not have a score method, then it raises an :exc:`.AttributeError` + + Parameters + ---------- + Y: (n,) vector of length n + Outcomes for each sample + T: (n,) vector of length n + Treatments for each sample + X: optional(n, d_x) matrix or None (Default=None) + Features for each sample + W: optional(n, d_w) matrix or None (Default=None) + Controls for each sample + + Returns + ------- + score: float + The MSE of the final CATE model on the new data. + """ + # Replacing score from _OrthoLearner, to enforce Z=None and improve the docstring + return super().score(Y, T, X=X, W=W) + + @property + def multitask_model_cate(self): + """ + Get the fitted final CATE model. + + Returns + ------- + multitask_model_cate: object of type(`model_final`) + An instance of the model_final object that was fitted after calling fit which corresponds whose + vector of outcomes correspond to the CATE model for each treatment, compared to baseline. + Available only when multitask_model_final=True. + """ + if not self.ortho_learner_model_final_._multitask_model_final: + raise AttributeError("Separate CATE models were fitted for each treatment! Use model_cate.") + return self.ortho_learner_model_final_.model_cate + + def model_cate(self, T=1): + """ + Get the fitted final CATE model. 
+ + Parameters + ---------- + T: alphanumeric + The treatment with respect to which we want the fitted CATE model. + + Returns + ------- + model_cate: object of type(model_final) + An instance of the model_final object that was fitted after calling fit which corresponds + to the CATE model for treatment T=t, compared to baseline. Available when multitask_model_final=False. + """ + if self.ortho_learner_model_final_._multitask_model_final: + raise AttributeError("A single multitask model was fitted for all treatments! Use multitask_model_cate.") + _, T = self._expand_treatments(None, T) + ind = inverse_onehot(T).item() - 1 + assert ind >= 0, "No model was fitted for the control" + return self.ortho_learner_model_final_.models_cate[ind] + + @property + def models_propensity(self): + """ + Get the fitted propensity models. + + Returns + ------- + models_propensity: list of objects of type(`model_propensity`) + A list of instances of the `model_propensity` object. Each element corresponds to a crossfitting + fold and is the model instance that was fitted for that training fold. + """ + return [mdl._model_propensity for mdl in super().models_nuisance_] + + @property + def models_regression(self): + """ + Get the fitted regression models. + + Returns + ------- + model_regression: list of objects of type(`model_regression`) + A list of instances of the model_regression object. Each element corresponds to a crossfitting + fold and is the model instance that was fitted for that training fold. + """ + return [mdl._model_regression for mdl in super().models_nuisance_] + + @property + def nuisance_scores_propensity(self): + """Gets the score for the propensity model on out-of-sample training data""" + return self.nuisance_scores_[0] + + @property + def nuisance_scores_regression(self): + """Gets the score for the regression model on out-of-sample training data""" + return self.nuisance_scores_[1] + + @property + def featurizer_(self): + """ + Get the fitted featurizer. + + Returns + ------- + featurizer: object of type(`featurizer`) + An instance of the fitted featurizer that was used to preprocess X in the final CATE model training. + Available only when featurizer is not None and X is not None. + """ + return self.ortho_learner_model_final_._featurizer + + def cate_feature_names(self, feature_names=None): + """ + Get the output feature names. + + Parameters + ---------- + feature_names: list of strings of length X.shape[1] or None + The names of the input features. If None and X is a dataframe, it defaults to the column names + from the dataframe. + + Returns + ------- + out_feature_names: list of strings or None + The names of the output features :math:`\\phi(X)`, i.e. the features with respect to which the + final CATE model for each treatment is linear. It is the names of the features that are associated + with each entry of the :meth:`coef_` parameter. Available only when the featurizer is not None and has + a method: `get_feature_names(feature_names)`. Otherwise None is returned. 
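Since `cate_feature_names` delegates to the featurizer's `get_feature_names`, a brief hedged example may help. It assumes a scikit-learn version in which `PolynomialFeatures` still exposes `get_feature_names` (newer releases renamed it to `get_feature_names_out`, in which case the call below would raise); the data generation simply mirrors the style of the doctests in this file.

```Python
import numpy as np
import scipy.special
from sklearn.preprocessing import PolynomialFeatures
from econml.dr import DRLearner

np.random.seed(123)
X = np.random.normal(size=(1000, 3))
T = np.random.binomial(2, scipy.special.expit(X[:, 0]))
y = (1 + .5 * X[:, 0]) * T + X[:, 0] + np.random.normal(size=(1000,))

# Composite features for the final CATE regression; the coefficient names can then
# be recovered through cate_feature_names.
est = DRLearner(featurizer=PolynomialFeatures(degree=2, include_bias=False))
est.fit(y, T, X=X, W=None)
print(est.cate_feature_names(['x0', 'x1', 'x2']))  # e.g. ['x0', 'x1', 'x2', 'x0^2', 'x0 x1', ...]
```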
+ """ + if self._d_x is None: + # Handles the corner case when X=None but featurizer might be not None + return None + if feature_names is None: + feature_names = self._input_names["feature_names"] + if self.featurizer_ is None: + return feature_names + elif hasattr(self.featurizer_, 'get_feature_names'): + # This fails if X=None and featurizer is not None, but that case is handled above + return self.featurizer_.get_feature_names(feature_names) + else: + raise AttributeError("Featurizer does not have a method: get_feature_names!") + + @property + def model_final_(self): + return self.ortho_learner_model_final_._model_final + + @property + def fitted_models_final(self): + return self.ortho_learner_model_final_.models_cate + + def shap_values(self, X, *, feature_names=None, treatment_names=None, output_names=None, background_samples=100): + if self.featurizer_ is not None: + F = self.featurizer_.transform(X) + else: + F = X + feature_names = self.cate_feature_names(feature_names) + + if self.ortho_learner_model_final_._multitask_model_final: + return _shap_explain_multitask_model_cate(self.const_marginal_effect, self.multitask_model_cate, F, + self._d_t, self._d_y, + feature_names=feature_names, + treatment_names=treatment_names, + output_names=output_names, + input_names=self._input_names, + background_samples=background_samples) + else: + return _shap_explain_model_cate(self.const_marginal_effect, self.fitted_models_final, + F, self._d_t, self._d_y, + feature_names=feature_names, + treatment_names=treatment_names, + output_names=output_names, + input_names=self._input_names, + background_samples=background_samples) + shap_values.__doc__ = LinearCateEstimator.shap_values.__doc__ + + +class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner): + """ + Special case of the :class:`.DRLearner` where the final stage + is a Linear Regression on a low dimensional set of features. In this case, inference + can be performed via the asymptotic normal characterization of the estimated parameters. + This is computationally faster than bootstrap inference. To do this, just leave the setting ``inference='auto'`` + unchanged, or explicitly set ``inference='statsmodels'`` or alter the covariance type calculation via + ``inference=StatsModelsInferenceDiscrete(cov_type='HC1')``. + + More concretely, this estimator assumes that the final cate model for each treatment takes a linear form: + + .. math :: + \\theta_t(X) = \\left\\langle \\theta_t, \\phi(X) \\right\\rangle + \\beta_t + + where :math:`\\phi(X)` is the output feature vector of the featurizer, or `X` if featurizer is None. :math:`\\beta_t` + is an intercept of the CATE, which is included if ``fit_cate_intercept=True`` (Default). It fits this by + running a standard ordinary linear regression (OLS), regressing the doubly robust outcome differences on X: + + .. math :: + \\min_{\\theta_t, \\beta_t}\ + E_n\\left[\\left(Y_{i, t}^{DR} - Y_{i, 0}^{DR}\ + - \\left\\langle \\theta_t, \\phi(X_i) \\right\\rangle - \\beta_t\\right)^2\\right] + + Then inference can be performed via standard approaches for inference of OLS, via asymptotic normal approximations + of the estimated parameters. The default covariance estimator used is heteroskedasticity robust (HC1). + For other methods see :class:`.StatsModelsInferenceDiscrete`. You can invoke them by setting: + ``inference=StatsModelsInferenceDiscrete(cov_type=...)``. + + This approach is valid even if the CATE model is not linear in :math:`\\phi(X)`.
In this case it performs + inference on the best linear approximation of the CATE model. + + Parameters + ---------- + model_propensity : scikit-learn classifier or 'auto', optional (default='auto') + Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. + Must implement `fit` and `predict_proba` methods. The `fit` method must be able to accept X and T, + where T is a shape (n, ) array. + If 'auto', :class:`~sklearn.linear_model.LogisticRegressionCV` will be chosen. + + model_regression : scikit-learn regressor or 'auto', optional (default='auto') + Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments) + concatenated. The one-hot-encoding excludes the baseline treatment. Must implement `fit` and + `predict` methods. If different models per treatment arm are desired, see the + :class:`.MultiModelWrapper` helper class. + If 'auto' :class:`.WeightedLassoCV`/:class:`.WeightedMultiTaskLassoCV` will be chosen. + + featurizer : :term:`transformer`, optional, default None + Must support fit_transform and transform. Used to create composite features in the final CATE regression. + It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). + If featurizer=None, then CATE is trained on X. + + fit_cate_intercept : bool, optional, default True + Whether the linear CATE model should have a constant term. + + min_propensity : float, optional, default ``1e-6`` + The minimum propensity at which to clip propensity estimates to avoid dividing by zero. + + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + + cv: int, cross-validation generator or an iterable, optional (default is 2) + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter` + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the treatment is discrete + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used + (with a random shuffle in either case). + + Unless an iterable is used, we call `split(X,T)` to generate the splits. + + mc_iters: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + + mc_agg: {'mean', 'median'}, optional (default='mean') + How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of + cross-fitting. + + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None + If int, random_state is the seed used by the random number generator; + If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; + If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used + by :mod:`np.random`. + + Examples + -------- + A simple example with the default models: + + .. testcode:: + :hide: + + import numpy as np + import scipy.special + np.set_printoptions(suppress=True) + + .. 
testcode:: + + from econml.dr import DRLearner, LinearDRLearner + + np.random.seed(123) + X = np.random.normal(size=(1000, 3)) + T = np.random.binomial(2, scipy.special.expit(X[:, 0])) + y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(size=(1000,)) + est = LinearDRLearner() + est.fit(y, T, X=X, W=None) + + >>> est.effect(X[:3]) + array([ 0.409743..., 0.312604..., -0.127394...]) + >>> est.effect_interval(X[:3]) + (array([ 0.120682..., -0.102543..., -0.663246...]), array([0.698803..., 0.727753..., 0.408458...])) + >>> est.coef_(T=1) + array([ 0.450779..., -0.003214... , 0.063884... ]) + >>> est.coef__interval(T=1) + (array([ 0.202646..., -0.207195..., -0.104558...]), array([0.698911..., 0.200767..., 0.232326...])) + >>> est.intercept_(T=1) + 0.88425066... + >>> est.intercept__interval(T=1) + (0.68655813..., 1.08194320...) + + Attributes + ---------- + score_ : float + The MSE in the final doubly robust potential outcome regressions, i.e. + + .. math:: + \\frac{1}{n_t} \\sum_{t=1}^{n_t} \\frac{1}{n} \\sum_{i=1}^n (Y_{i, t}^{DR} - \\hat{\\theta}_t(X_i))^2 + + where n_t is the number of treatments (excluding control). + + If `sample_weight` is not None at fit time, then a weighted average across samples is returned. + + """ + + def __init__(self, *, + model_propensity='auto', + model_regression='auto', + featurizer=None, + fit_cate_intercept=True, + min_propensity=1e-6, + categories='auto', + cv=2, + n_splits='raise', + mc_iters=None, + mc_agg='mean', + random_state=None): + self.fit_cate_intercept = fit_cate_intercept + super().__init__(model_propensity=model_propensity, + model_regression=model_regression, + model_final=None, + featurizer=featurizer, + multitask_model_final=False, + min_propensity=min_propensity, + categories=categories, + cv=cv, + n_splits=n_splits, + mc_iters=mc_iters, + mc_agg=mc_agg, + random_state=random_state) + + def _gen_model_final(self): + return StatsModelsLinearRegression(fit_intercept=self.fit_cate_intercept) + + def _gen_ortho_learner_model_final(self): + return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False) + + @_deprecate_positional("X and W should be passed by keyword only. In a future release " + "we will disallow passing X and W by position.", ['X', 'W']) + def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference='auto'): + """ + Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. + + Parameters + ---------- + Y: (n,) vector of length n + Outcomes for each sample + T: (n,) vector of length n + Treatments for each sample + X: optional(n, d_x) matrix or None (Default=None) + Features for each sample + W: optional(n, d_w) matrix or None (Default=None) + Controls for each sample + sample_weight: optional(n,) vector or None (Default=None) + Weights for each samples + sample_var: optional(n,) vector or None (Default=None) + Sample variance for each sample + groups: (n,) vector, optional + All rows corresponding to the same group will be kept together during splitting. + If groups is not None, the `cv` argument passed to this class's initializer + must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model + inference: string, :class:`.Inference` instance, or None + Method for performing inference. 
This estimator supports ``'bootstrap'`` + (or an instance of :class:`.BootstrapInference`) and ``'statsmodels'`` + (or an instance of :class:`.StatsModelsInferenceDiscrete`). + + Returns + ------- + self: DRLearner instance + """ + # Replacing fit from DRLearner, to add statsmodels inference in docstring + return super().fit(Y, T, X=X, W=W, + sample_weight=sample_weight, sample_var=sample_var, groups=groups, + cache_values=cache_values, inference=inference) + + @property + def fit_cate_intercept_(self): + return self.model_final_.fit_intercept + + @property + def multitask_model_cate(self): + # Replacing this method which is invalid for this class, so that we make the + # dosctring empty and not appear in the docs. + return super().multitask_model_cate + + @property + def multitask_model_final(self): + return False + + @multitask_model_final.setter + def multitask_model_final(self, value): + if value: + raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!") + + @property + def model_final(self): + return self._gen_model_final() + + @model_final.setter + def model_final(self, model): + if model is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator!") + + +class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner): + """ + Special case of the :class:`.DRLearner` where the final stage + is a Debiased Lasso Regression. In this case, inference can be performed via the debiased lasso approach + and its asymptotic normal characterization of the estimated parameters. This is computationally + faster than bootstrap inference. Leave the default ``inference='auto'`` unchanged, or explicitly set + ``inference='debiasedlasso'`` at fit time to enable inference via asymptotic normality. + + More concretely, this estimator assumes that the final cate model for each treatment takes a linear form: + + .. math :: + \\theta_t(X) = \\left\\langle \\theta_t, \\phi(X) \\right\\rangle + \\beta_t + + where :math:`\\phi(X)` is the outcome features of the featurizers, or `X` if featurizer is None. :math:`\\beta_t` + is a an intercept of the CATE, which is included if ``fit_cate_intercept=True`` (Default). It fits this by + running a debiased lasso regression (i.e. :math:`\\ell_1`-penalized regression with debiasing), + regressing the doubly robust outcome differences on X: i.e. first solves the penalized square loss problem + + .. math :: + \\min_{\\theta_t, \\beta_t}\ + E_n\\left[\\left(Y_{i, t}^{DR} - Y_{i, 0}^{DR}\ + - \\left\\langle \\theta_t, \\phi(X_i) \\right\\rangle - \\beta_t\\right)^2\\right]\ + + \\lambda \\left\\lVert \\theta_t \\right\\rVert_1 + + and then adds a debiasing correction to the solution. If alpha='auto' (recommended), then the penalty + weight :math:`\\lambda` is set optimally via cross-validation. + + This approach is valid even if the CATE model is not linear in :math:`\\phi(X)`. In this case it performs + inference on the best sparse linear approximation of the CATE model. + + Parameters + ---------- + model_propensity : scikit-learn classifier or 'auto', optional (default='auto') + Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. + Must implement `fit` and `predict_proba` methods. The `fit` method must be able to accept X and T, + where T is a shape (n, ) array. + If 'auto', :class:`~sklearn.linear_model.LogisticRegressionCV` will be chosen. 
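Tying back to the inference options described for `LinearDRLearner` above, the snippet below sketches how a covariance type other than the HC1 default might be requested at fit time. The choice `cov_type='HC0'` is an assumption for illustration (the text above only names HC1 explicitly), and the data is synthetic.

```Python
import numpy as np
import scipy.special
from econml.dr import LinearDRLearner
from econml.inference import StatsModelsInferenceDiscrete

np.random.seed(123)
X = np.random.normal(size=(1000, 3))
T = np.random.binomial(1, scipy.special.expit(X[:, 0]))
y = (1 + .5 * X[:, 0]) * T + X[:, 0] + np.random.normal(size=(1000,))

est = LinearDRLearner()
# Request a specific covariance estimator for the OLS final stage (assumed supported).
est.fit(y, T, X=X, W=None, inference=StatsModelsInferenceDiscrete(cov_type='HC0'))
lb, ub = est.effect_interval(X[:3], alpha=0.05)
```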
+ + model_regression : scikit-learn regressor or 'auto', optional (default='auto') + Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments) + concatenated. The one-hot-encoding excludes the baseline treatment. Must implement `fit` and + `predict` methods. If different models per treatment arm are desired, see the + :class:`.MultiModelWrapper` helper class. + If 'auto' :class:`.WeightedLassoCV`/:class:`.WeightedMultiTaskLassoCV` will be chosen. + + featurizer : :term:`transformer`, optional, default None + Must support fit_transform and transform. Used to create composite features in the final CATE regression. + It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). + If featurizer=None, then CATE is trained on X. + + fit_cate_intercept : bool, optional, default True + Whether the linear CATE model should have a constant term. + + alpha: string | float, optional., default 'auto'. + CATE L1 regularization applied through the debiased lasso in the final model. + 'auto' corresponds to a CV form of the :class:`DebiasedLasso`. + + n_alphas : int, optional, default 100 + How many alphas to try if alpha='auto' + + alpha_cov : string | float, optional, default 'auto' + The regularization alpha that is used when constructing the pseudo inverse of + the covariance matrix Theta used to for correcting the final state lasso coefficient + in the debiased lasso. Each such regression corresponds to the regression of one feature + on the remainder of the features. + + n_alphas_cov : int, optional, default 10 + How many alpha_cov to try if alpha_cov='auto'. + + max_iter : int, optional, default 1000 + The maximum number of iterations in the Debiased Lasso + + tol : float, optional, default 1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``. + + n_jobs : int or None, optional (default=None) + The number of jobs to run in parallel for both `fit` and `predict`. + ``None`` means 1 unless in a :func:`joblib.parallel_backend` context. + ``-1`` means using all processors. + + min_propensity : float, optional, default ``1e-6`` + The minimum propensity at which to clip propensity estimates to avoid dividing by zero. + + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + + cv: int, cross-validation generator or an iterable, optional, default 2 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter` + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the treatment is discrete + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used + (with a random shuffle in either case). + + Unless an iterable is used, we call `split(X,T)` to generate the splits. + + mc_iters: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + + mc_agg: {'mean', 'median'}, optional (default='mean') + How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of + cross-fitting. 
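Because the debiased-lasso final stage described above is aimed at settings with many candidate effect modifiers, a minimal high-dimensional sketch may be useful; the numeric choices (50 features, 1000 samples) are illustrative assumptions rather than recommendations.

```Python
import numpy as np
import scipy.special
from econml.dr import SparseLinearDRLearner

np.random.seed(123)
X = np.random.normal(size=(1000, 50))           # many candidate effect modifiers
T = np.random.binomial(1, scipy.special.expit(X[:, 0]))
y = (1 + .5 * X[:, 0]) * T + X[:, 0] + np.random.normal(size=(1000,))

est = SparseLinearDRLearner(alpha='auto')       # penalty weight chosen by cross-validation
est.fit(y, T, X=X, W=None)
point = est.coef_(T=1)                          # sparse CATE coefficients vs. control
lb, ub = est.coef__interval(T=1)                # debiased-lasso confidence intervals
```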
+ + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None + If int, random_state is the seed used by the random number generator; + If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; + If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used + by :mod:`np.random`. + + Examples + -------- + A simple example with the default models: + + .. testcode:: + :hide: + + import numpy as np + import scipy.special + np.set_printoptions(suppress=True) + + .. testcode:: + + from econml.dr import DRLearner, SparseLinearDRLearner + + np.random.seed(123) + X = np.random.normal(size=(1000, 3)) + T = np.random.binomial(2, scipy.special.expit(X[:, 0])) + y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(size=(1000,)) + est = SparseLinearDRLearner() + est.fit(y, T, X=X, W=None) + + >>> est.effect(X[:3]) + array([ 0.41..., 0.31..., -0.12...]) + >>> est.effect_interval(X[:3]) + (array([ 0.04..., -0.19..., -0.73...]), array([0.77..., 0.82..., 0.47...])) + >>> est.coef_(T=1) + array([ 0.45..., -0.00..., 0.06...]) + >>> est.coef__interval(T=1) + (array([ 0.24... , -0.19..., -0.13...]), array([0.65..., 0.19..., 0.26...])) + >>> est.intercept_(T=1) + 0.88... + >>> est.intercept__interval(T=1) + (0.68..., 1.08...) + + Attributes + ---------- + score_ : float + The MSE in the final doubly robust potential outcome regressions, i.e. + + .. math:: + \\frac{1}{n_t} \\sum_{t=1}^{n_t} \\frac{1}{n} \\sum_{i=1}^n (Y_{i, t}^{DR} - \\hat{\\theta}_t(X_i))^2 + + where n_t is the number of treatments (excluding control). + + If `sample_weight` is not None at fit time, then a weighted average across samples is returned. + + """ + + def __init__(self, *, + model_propensity='auto', + model_regression='auto', + featurizer=None, + fit_cate_intercept=True, + alpha='auto', + n_alphas=100, + alpha_cov='auto', + n_alphas_cov=10, + max_iter=1000, + tol=1e-4, + n_jobs=None, + min_propensity=1e-6, + categories='auto', + cv=2, + n_splits='raise', + mc_iters=None, + mc_agg='mean', + random_state=None): + self.fit_cate_intercept = fit_cate_intercept + self.alpha = alpha + self.n_alphas = n_alphas + self.alpha_cov = alpha_cov + self.n_alphas_cov = n_alphas_cov + self.max_iter = max_iter + self.tol = tol + self.n_jobs = n_jobs + super().__init__(model_propensity=model_propensity, + model_regression=model_regression, + model_final=None, + featurizer=featurizer, + multitask_model_final=False, + min_propensity=min_propensity, + categories=categories, + cv=cv, + n_splits=n_splits, + mc_iters=mc_iters, + mc_agg=mc_agg, + random_state=random_state) + + def _gen_model_final(self): + return DebiasedLasso(alpha=self.alpha, + n_alphas=self.n_alphas, + alpha_cov=self.alpha_cov, + n_alphas_cov=self.n_alphas_cov, + fit_intercept=self.fit_cate_intercept, + max_iter=self.max_iter, + tol=self.tol, + n_jobs=self.n_jobs, + random_state=self.random_state) + + def _gen_ortho_learner_model_final(self): + return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False) + + @_deprecate_positional("X and W should be passed by keyword only. In a future release " + "we will disallow passing X and W by position.", ['X', 'W']) + def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference='auto'): + """ + Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. 
+ + Parameters + ---------- + Y: (n,) vector of length n + Outcomes for each sample + T: (n,) vector of length n + Treatments for each sample + X: optional(n, d_x) matrix or None (Default=None) + Features for each sample + W: optional(n, d_w) matrix or None (Default=None) + Controls for each sample + sample_weight: optional(n,) vector or None (Default=None) + Weights for each samples + sample_var: optional(n,) vector or None (Default=None) + Sample variance for each sample + groups: (n,) vector, optional + All rows corresponding to the same group will be kept together during splitting. + If groups is not None, the `cv` argument passed to this class's initializer + must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model + inference: string, :class:`.Inference` instance, or None + Method for performing inference. This estimator supports ``'bootstrap'`` + (or an instance of :class:`.BootstrapInference`) and ``'debiasedlasso'`` + (or an instance of :class:`.LinearModelInferenceDiscrete`). + + Returns + ------- + self: DRLearner instance + """ + # Replacing fit from DRLearner, to add debiasedlasso inference in docstring + # TODO: support sample_var + if sample_weight is not None and inference is not None: + warn("This estimator does not yet support sample variances and inference does not take " + "sample variances into account. This feature will be supported in a future release.") + check_high_dimensional(X, T, threshold=5, featurizer=self.featurizer, + discrete_treatment=self.discrete_treatment, + msg="The number of features in the final model (< 5) is too small for a sparse model. " + "We recommend using the LinearDRLearner for this low-dimensional setting.") + return super().fit(Y, T, X=X, W=W, + sample_weight=sample_weight, sample_var=None, groups=groups, + cache_values=cache_values, inference=inference) + + @property + def fit_cate_intercept_(self): + return self.model_final_.fit_intercept + + @property + def multitask_model_final(self): + return False + + @multitask_model_final.setter + def multitask_model_final(self, value): + if value: + raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!") + + @property + def model_final(self): + return self._gen_model_final() + + @model_final.setter + def model_final(self, model): + if model is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator!") + + +class ForestDRLearner(ForestModelFinalCateEstimatorDiscreteMixin, DRLearner): + """ Instance of DRLearner with a :class:`~econml.grf.RegressionForest` + as a final model, so as to enable non-parametric inference. + + Parameters + ---------- + model_propensity : scikit-learn classifier + Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. + Must implement `fit` and `predict_proba` methods. The `fit` method must be able to accept X and T, + where T is a shape (n, ) array. + + model_regression : scikit-learn regressor + Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments) + concatenated. The one-hot-encoding excludes the baseline treatment. Must implement `fit` and + `predict` methods. If different models per treatment arm are desired, see the + :class:`~econml.utilities.MultiModelWrapper` helper class. 
+ + min_propensity : float, optional, default ``1e-6`` + The minimum propensity at which to clip propensity estimates to avoid dividing by zero. + + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + + cv: int, cross-validation generator or an iterable, optional (Default=2) + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter` + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the treatment is discrete + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used + (with a random shuffle in either case). + + Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all + W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + + n_crossfit_splits: int or 'raise', optional (default='raise') + Deprecated by parameter `cv` and will be removed in next version. Can be used + interchangeably with `cv`. + + mc_iters: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + + mc_agg: {'mean', 'median'}, optional (default='mean') + How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of + cross-fitting. + + n_estimators : integer, optional (default=100) + The total number of trees in the forest. The forest consists of a + forest of sqrt(n_estimators) sub-forests, where each sub-forest + contains sqrt(n_estimators) trees. + + criterion : string, optional (default="mse") + The function to measure the quality of a split. Supported criteria + are "mse" for the mean squared error, which is equal to variance + reduction as feature selection criterion, and "mae" for the mean + absolute error. + + max_depth : integer or None, optional (default=None) + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int, float, optional (default=2) + The minimum number of splitting samples required to split an internal node. + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + min_samples_leaf : int, float, optional (default=1) + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` splitting samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. After construction the tree is also pruned + so that there are at least min_samples_leaf estimation samples on + each leaf. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + + min_weight_fraction_leaf : float, optional (default=0.) + The minimum weighted fraction of the sum total of weights (of all + splitting samples) required to be at a leaf node. 
Samples have + equal weight when sample_weight is not provided. After construction + the tree is pruned so that the fraction of the sum total weight + of the estimation samples contained in each leaf node is at + least min_weight_fraction_leaf + + max_features : int, float, string or None, optional (default="auto") + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `int(max_features * n_features)` features are considered at each + split. + - If "auto", then `max_features=n_features`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + max_leaf_nodes : int or None, optional (default=None) + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, optional (default=0.) + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of split samples, ``N_t`` is the number of + split samples at the current node, ``N_t_L`` is the number of split samples in the + left child, and ``N_t_R`` is the number of split samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + subsample_fr : float or 'auto', optional (default='auto') + The fraction of the half-samples that are used on each tree. Each tree + will be built on subsample_fr * n_samples/2. + + If 'auto', then the subsampling fraction is set to:: + + (n_samples/2)**(1-1/(2*n_features+2))/(n_samples/2) + + which is sufficient to guarantee asympotitcally valid inference. + + honest : boolean, optional (default=True) + Whether to use honest trees, i.e. half of the samples are used for + creating the tree structure and the other half for the estimation at + the leafs. If False, then all samples are used for both parts. + + n_jobs : int or None, optional (default=None) + The number of jobs to run in parallel for both `fit` and `predict`. + ``None`` means 1 unless in a :func:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, optional (default=0) + Controls the verbosity when fitting and predicting. + + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; + If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used + by :mod:`np.random`. 
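Since `ForestDRLearner` pairs the doubly robust pseudo-outcomes with a forest final stage that supports non-parametric confidence intervals, a short end-to-end sketch follows. The synthetic data mirrors the style of the doctests in this file, and the forest size is an arbitrary illustrative choice.

```Python
import numpy as np
import scipy.special
from econml.dr import ForestDRLearner

np.random.seed(123)
X = np.random.normal(size=(1000, 3))
T = np.random.binomial(1, scipy.special.expit(X[:, 0]))
y = (1 + .5 * X[:, 0]) * T + X[:, 0] + np.random.normal(size=(1000,))

est = ForestDRLearner(n_estimators=200)           # first-stage models default to 'auto'
est.fit(y, T, X=X, W=None, inference='blb')       # bootstrap-of-little-bags intervals
point = est.effect(X[:5])
lb, ub = est.effect_interval(X[:5], alpha=0.05)   # intervals from the forest's variance estimates
```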
+ """ + + def __init__(self, *, + model_regression="auto", + model_propensity="auto", + featurizer=None, + min_propensity=1e-6, + categories='auto', + cv=2, + n_crossfit_splits='raise', + mc_iters=None, + mc_agg='mean', + n_estimators=1000, + criterion='deprecated', + max_depth=None, + min_samples_split=5, + min_samples_leaf=5, + min_weight_fraction_leaf=0., + max_features="auto", + max_leaf_nodes='deprecated', + min_impurity_decrease=0., + subsample_fr='deprecated', + max_samples=.45, + min_balancedness_tol=.45, + honest=True, + subforest_size=4, + n_jobs=-1, + verbose=0, + random_state=None): + self.n_estimators = n_estimators + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.min_impurity_decrease = min_impurity_decrease + self.max_samples = max_samples + self.min_balancedness_tol = min_balancedness_tol + self.honest = honest + self.subforest_size = subforest_size + self.n_jobs = n_jobs + self.verbose = verbose + self.n_crossfit_splits = n_crossfit_splits + if self.n_crossfit_splits != 'raise': + cv = self.n_crossfit_splits + self.subsample_fr = subsample_fr + self.max_leaf_nodes = max_leaf_nodes + self.criterion = criterion + super().__init__(model_regression=model_regression, + model_propensity=model_propensity, + model_final=None, + featurizer=featurizer, + multitask_model_final=False, + min_propensity=min_propensity, + categories=categories, + cv=cv, + n_splits='raise', + mc_iters=mc_iters, + mc_agg=mc_agg, + random_state=random_state) + + def _gen_model_final(self): + return RegressionForest(n_estimators=self.n_estimators, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_features=self.max_features, + min_impurity_decrease=self.min_impurity_decrease, + max_samples=self.max_samples, + min_balancedness_tol=self.min_balancedness_tol, + honest=self.honest, + inference=True, + subforest_size=self.subforest_size, + n_jobs=self.n_jobs, + random_state=self.random_state, + verbose=self.verbose, + warm_start=False) + + def _gen_ortho_learner_model_final(self): + return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False) + + @_deprecate_positional("X and W should be passed by keyword only. In a future release " + "we will disallow passing X and W by position.", ['X', 'W']) + def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference='auto'): + """ + Estimate the counterfactual model from data, i.e. estimates functions τ(·,·,·), ∂τ(·,·). + + Parameters + ---------- + Y: (n × d_y) matrix or vector of length n + Outcomes for each sample + T: (n × dₜ) matrix or vector of length n + Treatments for each sample + X: optional (n × dₓ) matrix + Features for each sample + W: optional (n × d_w) matrix + Controls for each sample + sample_weight: optional (n,) vector + Weights for each row + sample_var: optional (n, n_y) vector + Variance of sample, in case it corresponds to summary of many samples. Currently + not in use by this method (as inference method does not require sample variance info). + groups: (n,) vector, optional + All rows corresponding to the same group will be kept together during splitting. 
+ If groups is not None, the `cv` argument passed to this class's initializer + must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model + inference: string, `Inference` instance, or None + Method for performing inference. This estimator supports 'bootstrap' + (or an instance of :class:`.BootstrapInference`) and 'blb' + (for Bootstrap-of-Little-Bags based inference) + + Returns + ------- + self + """ + return super().fit(Y, T, X=X, W=W, + sample_weight=sample_weight, sample_var=None, groups=groups, + cache_values=cache_values, inference=inference) + + def multitask_model_cate(self): + # Replacing to remove docstring + super().multitask_model_cate() + + @property + def multitask_model_final(self): + return False + + @multitask_model_final.setter + def multitask_model_final(self, value): + if value: + raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!") + + @property + def model_final(self): + return self._gen_model_final() + + @model_final.setter + def model_final(self, model): + if model is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator!") + + #################################################################### + # Everything below should be removed once parameters are deprecated + #################################################################### + + @property + def n_crossfit_splits(self): + return self.cv + + @n_crossfit_splits.setter + def n_crossfit_splits(self, value): + if value != 'raise': + warn("Deprecated by parameter `n_splits` and will be removed in next version.") + self.cv = value + + @property + def criterion(self): + return self.criterion + + @criterion.setter + def criterion(self, value): + if value != 'deprecated': + warn("The parameter 'criterion' has been deprecated and will be removed in the next version. " + "Only the 'mse' criterion is supported.") + + @property + def max_leaf_nodes(self): + return self.max_leaf_nodes + + @max_leaf_nodes.setter + def max_leaf_nodes(self, value): + if value != 'deprecated': + warn("The parameter 'max_leaf_nodes' has been deprecated and will be removed in the next version.") + + @property + def subsample_fr(self): + return 2 * self.max_samples + + @subsample_fr.setter + def subsample_fr(self, value): + if value != 'deprecated': + warn("The parameter 'subsample_fr' has been deprecated and will be removed in the next version. " + "Use 'max_samples' instead, with the convention that " + "'subsample_fr=x' is equivalent to 'max_samples=x/2'.") + self.max_samples = .45 if value == 'auto' else value / 2 diff --git a/econml/drlearner.py b/econml/drlearner.py index 1eedbc78..37942426 100644 --- a/econml/drlearner.py +++ b/econml/drlearner.py @@ -1,1537 +1,29 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -""" -Doubly Robust Learner. The method uses the doubly robust correction to construct doubly -robust estimates of all the potential outcomes of each samples. Then estimates a CATE model -by regressing the potential outcome differences on the heterogeneity features X. +import econml.dr as dr +from .utilities import deprecated -References ----------- -Dylan Foster, Vasilis Syrgkanis (2019). - Orthogonal Statistical Learning. - ACM Conference on Learning Theory. 
https://arxiv.org/abs/1901.09036 +@deprecated("The econml.drlearner.DRLearner class has been moved to econml.dr.DRLearner; " + "an upcoming release will remove support for the old name") +class DRLearner(dr.DRLearner): + pass -Robins, J.M., Rotnitzky, A., and Zhao, L.P. (1994). - Estimation of regression coefficients when some regressors are not always observed. - Journal of the American Statistical Association 89,846–866. -Bang, H. and Robins, J.M. (2005). - Doubly robust estimation in missing data and causal inference models. - Biometrics 61,962–972. +@deprecated("The econml.drlearner.LinearDRLearner class has been moved to econml.dr.LinearDRLearner; " + "an upcoming release will remove support for the old name") +class LinearDRLearner(dr.LinearDRLearner): + pass -Tsiatis AA (2006). - Semiparametric Theory and Missing Data. - New York: Springer; 2006. -.. testcode:: - :hide: +@deprecated("The econml.drlearner.SparseLinearDRLearner class has been moved to econml.dr.SparseLinearDRLearner; " + "an upcoming release will remove support for the old name") +class SparseLinearDRLearner(dr.SparseLinearDRLearner): + pass - import numpy as np - import scipy.special - np.set_printoptions(suppress=True) -""" - -from warnings import warn -from copy import deepcopy - -import numpy as np -from sklearn.base import clone -from sklearn.linear_model import (LassoCV, LinearRegression, - LogisticRegressionCV) -from sklearn.ensemble import RandomForestRegressor - -from ._ortho_learner import _OrthoLearner -from ._cate_estimator import (DebiasedLassoCateEstimatorDiscreteMixin, - ForestModelFinalCateEstimatorDiscreteMixin, - StatsModelsCateEstimatorDiscreteMixin, LinearCateEstimator) -from .inference import GenericModelFinalInferenceDiscrete -from .grf import RegressionForest -from .sklearn_extensions.linear_model import ( - DebiasedLasso, StatsModelsLinearRegression, WeightedLassoCVWrapper) -from .utilities import (_deprecate_positional, check_high_dimensional, - filter_none_kwargs, fit_with_groups, inverse_onehot) -from ._shap import _shap_explain_multitask_model_cate, _shap_explain_model_cate - - -class _ModelNuisance: - def __init__(self, model_propensity, model_regression, min_propensity): - self._model_propensity = model_propensity - self._model_regression = model_regression - self._min_propensity = min_propensity - - def _combine(self, X, W): - return np.hstack([arr for arr in [X, W] if arr is not None]) - - def fit(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): - if Y.ndim != 1 and (Y.ndim != 2 or Y.shape[1] != 1): - raise ValueError("The outcome matrix must be of shape ({0}, ) or ({0}, 1), " - "instead got {1}.".format(len(X), Y.shape)) - if (X is None) and (W is None): - raise AttributeError("At least one of X or W has to not be None!") - if np.any(np.all(T == 0, axis=0)) or (not np.any(np.all(T == 0, axis=1))): - raise AttributeError("Provided crossfit folds contain training splits that " + - "don't contain all treatments") - XW = self._combine(X, W) - filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight) - - fit_with_groups(self._model_propensity, XW, inverse_onehot(T), groups=groups, **filtered_kwargs) - fit_with_groups(self._model_regression, np.hstack([XW, T]), Y, groups=groups, **filtered_kwargs) - return self - - def score(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): - XW = self._combine(X, W) - filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight) - - if hasattr(self._model_propensity, 'score'): - propensity_score = 
self._model_propensity.score(XW, inverse_onehot(T), **filtered_kwargs) - else: - propensity_score = None - if hasattr(self._model_regression, 'score'): - regression_score = self._model_regression.score(np.hstack([XW, T]), Y, **filtered_kwargs) - else: - regression_score = None - - return propensity_score, regression_score - - def predict(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): - XW = self._combine(X, W) - propensities = np.maximum(self._model_propensity.predict_proba(XW), self._min_propensity) - n = T.shape[0] - Y_pred = np.zeros((T.shape[0], T.shape[1] + 1)) - T_counter = np.zeros(T.shape) - Y_pred[:, 0] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n) - Y_pred[:, 0] += (Y.reshape(n) - Y_pred[:, 0]) * np.all(T == 0, axis=1) / propensities[:, 0] - for t in np.arange(T.shape[1]): - T_counter = np.zeros(T.shape) - T_counter[:, t] = 1 - Y_pred[:, t + 1] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n) - Y_pred[:, t + 1] += (Y.reshape(n) - Y_pred[:, t + 1]) * (T[:, t] == 1) / propensities[:, t + 1] - return Y_pred.reshape(Y.shape + (T.shape[1] + 1,)) - - -class _ModelFinal: - # Coding Remark: The reasoning around the multitask_model_final could have been simplified if - # we simply wrapped the model_final with a MultiOutputRegressor. However, because we also want - # to allow even for model_final objects whose fit(X, y) can accept X=None - # (e.g. the StatsModelsLinearRegression), we cannot take that route, because the MultiOutputRegressor - # checks that X is 2D array. - def __init__(self, model_final, featurizer, multitask_model_final): - self._model_final = clone(model_final, safe=False) - self._featurizer = clone(featurizer, safe=False) - self._multitask_model_final = multitask_model_final - return - - def fit(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, sample_var=None): - Y_pred, = nuisances - self.d_y = Y_pred.shape[1:-1] # track whether there's a Y dimension (must be a singleton) - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.fit_transform(X) - filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight, sample_var=sample_var) - if self._multitask_model_final: - ys = Y_pred[..., 1:] - Y_pred[..., [0]] # subtract control results from each other arm - if self.d_y: # need to squeeze out singleton so that we fit on 2D array - ys = ys.squeeze(1) - self.model_cate = self._model_final.fit(X, ys, **filtered_kwargs) - else: - self.models_cate = [clone(self._model_final, safe=False).fit(X, Y_pred[..., t] - Y_pred[..., 0], - **filtered_kwargs) - for t in np.arange(1, Y_pred.shape[-1])] - return self - - def predict(self, X=None): - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.transform(X) - if self._multitask_model_final: - pred = self.model_cate.predict(X) - if self.d_y: # need to reintroduce singleton Y dimension - return pred[:, np.newaxis, :] - return pred - else: - preds = np.array([mdl.predict(X).reshape((-1,) + self.d_y) for mdl in self.models_cate]) - return np.moveaxis(preds, 0, -1) # move treatment dim to end - - def score(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, sample_var=None): - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.transform(X) - Y_pred, = nuisances - if self._multitask_model_final: - return np.mean(np.average((Y_pred[..., 1:] - Y_pred[..., [0]] - self.model_cate.predict(X))**2, - weights=sample_weight, axis=0)) - else: - return np.mean([np.average((Y_pred[..., t] - 
Y_pred[..., 0] - - self.models_cate[t - 1].predict(X))**2, - weights=sample_weight, axis=0) - for t in np.arange(1, Y_pred.shape[-1])]) - - -class DRLearner(_OrthoLearner): - """ - CATE estimator that uses doubly-robust correction techniques to account for - covariate shift (selection bias) between the treatment arms. The estimator is a special - case of an :class:`._OrthoLearner` estimator, so it follows the two - stage process, where a set of nuisance functions are estimated in the first stage in a crossfitting - manner and a final stage estimates the CATE model. See the documentation of - :class:`._OrthoLearner` for a description of this two stage process. - - In this estimator, the CATE is estimated by using the following estimating equations. If we let: - - .. math :: - Y_{i, t}^{DR} = E[Y | X_i, W_i, T_i=t]\ - + \\frac{Y_i - E[Y | X_i, W_i, T_i=t]}{Pr[T_i=t | X_i, W_i]} \\cdot 1\\{T_i=t\\} - - Then the following estimating equation holds: - - .. math :: - E\\left[Y_{i, t}^{DR} - Y_{i, 0}^{DR} | X_i\\right] = \\theta_t(X_i) - - Thus if we estimate the nuisance functions :math:`h(X, W, T) = E[Y | X, W, T]` and - :math:`p_t(X, W)=Pr[T=t | X, W]` in the first stage, we can estimate the final stage cate for each - treatment t, by running a regression, regressing :math:`Y_{i, t}^{DR} - Y_{i, 0}^{DR}` on :math:`X_i`. - - The problem of estimating the nuisance function :math:`p` is a simple multi-class classification - problem of predicting the label :math:`T` from :math:`X, W`. The :class:`.DRLearner` - class takes as input the parameter ``model_propensity``, which is an arbitrary scikit-learn - classifier, that is internally used to solve this classification problem. - - The second nuisance function :math:`h` is a simple regression problem and the :class:`.DRLearner` - class takes as input the parameter ``model_regressor``, which is an arbitrary scikit-learn regressor that - is internally used to solve this regression problem. - - The final stage is multi-task regression problem with outcomes the labels :math:`Y_{i, t}^{DR} - Y_{i, 0}^{DR}` - for each non-baseline treatment t. The :class:`.DRLearner` takes as input parameter - ``model_final``, which is any scikit-learn regressor that is internally used to solve this multi-task - regresion problem. If the parameter ``multitask_model_final`` is False, then this model is assumed - to be a mono-task regressor, and separate clones of it are used to solve each regression target - separately. - - Parameters - ---------- - model_propensity : scikit-learn classifier or 'auto', optional (default='auto') - Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. - Must implement `fit` and `predict_proba` methods. The `fit` method must be able to accept X and T, - where T is a shape (n, ) array. - If 'auto', :class:`~sklearn.linear_model.LogisticRegressionCV` will be chosen. - - model_regression : scikit-learn regressor or 'auto', optional (default='auto') - Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments) - concatenated. The one-hot-encoding excludes the baseline treatment. Must implement `fit` and - `predict` methods. If different models per treatment arm are desired, see the - :class:`.MultiModelWrapper` helper class. - If 'auto' :class:`.WeightedLassoCV`/:class:`.WeightedMultiTaskLassoCV` will be chosen. - - model_final : - estimator for the final cate model. Trained on regressing the doubly robust potential outcomes - on (features X). 
- - - If X is None, then the fit method of model_final should be able to handle X=None. - - If featurizer is not None and X is not None, then it is trained on the outcome of - featurizer.fit_transform(X). - - If multitask_model_final is True, then this model must support multitasking - and it is trained by regressing all doubly robust target outcomes on (featurized) features simultanteously. - - The output of the predict(X) of the trained model will contain the CATEs for each treatment compared to - baseline treatment (lexicographically smallest). If multitask_model_final is False, it is assumed to be a - mono-task model and a separate clone of the model is trained for each outcome. Then predict(X) of the t-th - clone will be the CATE of the t-th lexicographically ordered treatment compared to the baseline. - - multitask_model_final : bool, optional, default False - Whether the model_final should be treated as a multi-task model. See description of model_final. - - featurizer : :term:`transformer`, optional, default None - Must support fit_transform and transform. Used to create composite features in the final CATE regression. - It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). - If featurizer=None, then CATE is trained on X. - - min_propensity : float, optional, default ``1e-6`` - The minimum propensity at which to clip propensity estimates to avoid dividing by zero. - - categories: 'auto' or list, default 'auto' - The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). - The first category will be treated as the control treatment. - - cv: int, cross-validation generator or an iterable, optional (default is 2) - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - :term:`CV splitter` - - An iterable yielding (train, test) splits as arrays of indices. - - For integer/None inputs, if the treatment is discrete - :class:`~sklearn.model_selection.StratifiedKFold` is used, else, - :class:`~sklearn.model_selection.KFold` is used - (with a random shuffle in either case). - - Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all - W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. - - mc_iters: int, optional (default=None) - The number of times to rerun the first stage models to reduce the variance of the nuisances. - - mc_agg: {'mean', 'median'}, optional (default='mean') - How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of - cross-fitting. - - random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None - If int, random_state is the seed used by the random number generator; - If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; - If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used - by :mod:`np.random`. - - Examples - -------- - A simple example with the default models: - - .. 
testcode:: - - from econml.drlearner import DRLearner - - np.random.seed(123) - X = np.random.normal(size=(1000, 3)) - T = np.random.binomial(2, scipy.special.expit(X[:, 0])) - sigma = 0.001 - y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(0, sigma, size=(1000,)) - est = DRLearner() - est.fit(y, T, X=X, W=None) - - >>> est.const_marginal_effect(X[:2]) - array([[0.511640..., 1.144004...], - [0.378140..., 0.613143...]]) - >>> est.effect(X[:2], T0=0, T1=1) - array([0.511640..., 0.378140...]) - >>> est.score_ - 5.11238581... - >>> est.score(y, T, X=X) - 5.78673506... - >>> est.model_cate(T=1).coef_ - array([0.434910..., 0.010226..., 0.047913...]) - >>> est.model_cate(T=2).coef_ - array([ 0.863723..., 0.086946..., -0.022288...]) - >>> est.cate_feature_names() - - >>> [mdl.coef_ for mdl in est.models_regression] - [array([ 1.472..., 0.001..., -0.011..., 0.698..., 2.049...]), - array([ 1.455..., -0.002..., 0.005..., 0.677..., 1.998...])] - >>> [mdl.coef_ for mdl in est.models_propensity] - [array([[-0.747..., 0.153..., -0.018...], - [ 0.083..., -0.110..., -0.076...], - [ 0.663..., -0.043... , 0.094...]]), - array([[-1.048..., 0.000..., 0.032...], - [ 0.019..., 0.124..., -0.081...], - [ 1.029..., -0.124..., 0.049...]])] - - Beyond default models: - - .. testcode:: - - from sklearn.linear_model import LassoCV - from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor - from econml.drlearner import DRLearner - - np.random.seed(123) - X = np.random.normal(size=(1000, 3)) - T = np.random.binomial(2, scipy.special.expit(X[:, 0])) - sigma = 0.01 - y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(0, sigma, size=(1000,)) - est = DRLearner(model_propensity=RandomForestClassifier(n_estimators=100, min_samples_leaf=10), - model_regression=RandomForestRegressor(n_estimators=100, min_samples_leaf=10), - model_final=LassoCV(cv=3), - featurizer=None) - est.fit(y, T, X=X, W=None) - - >>> est.score_ - 1.7... - >>> est.const_marginal_effect(X[:3]) - array([[0.68..., 1.10...], - [0.56..., 0.79...], - [0.34..., 0.10...]]) - >>> est.model_cate(T=2).coef_ - array([0.74..., 0. , 0. ]) - >>> est.model_cate(T=2).intercept_ - 1.9... - >>> est.model_cate(T=1).coef_ - array([0.24..., 0.00..., 0. ]) - >>> est.model_cate(T=1).intercept_ - 0.94... - - Attributes - ---------- - score_ : float - The MSE in the final doubly robust potential outcome regressions, i.e. - - .. math:: - \\frac{1}{n_t} \\sum_{t=1}^{n_t} \\frac{1}{n} \\sum_{i=1}^n (Y_{i, t}^{DR} - \\hat{\\theta}_t(X_i))^2 - - where n_t is the number of treatments (excluding control). - - If `sample_weight` is not None at fit time, then a weighted average across samples is returned. 
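Editorial sketch (not part of the patch): the estimating equations above are implemented by the ``_ModelNuisance.predict`` method shown earlier in this file; the condensed function below mirrors that logic for readers who want to see the doubly robust pseudo-outcomes built end to end. It assumes already-fitted sklearn-style ``model_regression`` and ``model_propensity`` objects, a 1-d outcome ``Y``, and a one-hot treatment matrix ``T`` with the control column dropped, exactly as in the class above.

```Python
import numpy as np

def dr_pseudo_outcomes(model_regression, model_propensity, Y, T, XW, min_propensity=1e-6):
    """Columns are Y^DR for the control arm (0) and each treatment arm (1..d_t)."""
    n, d_t = T.shape
    propensities = np.maximum(model_propensity.predict_proba(XW), min_propensity)
    Y_dr = np.zeros((n, d_t + 1))
    for t in range(d_t + 1):
        T_counter = np.zeros_like(T)
        if t > 0:
            T_counter[:, t - 1] = 1                  # counterfactual one-hot for arm t
        Y_dr[:, t] = model_regression.predict(np.hstack([XW, T_counter])).reshape(n)
        observed_t = np.all(T == 0, axis=1) if t == 0 else (T[:, t - 1] == 1)
        Y_dr[:, t] += observed_t * (Y - Y_dr[:, t]) / propensities[:, t]   # DR correction
    return Y_dr

# The final stage then regresses Y_dr[:, t] - Y_dr[:, 0] on (featurized) X,
# one regression per non-baseline arm unless multitask_model_final=True.
```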
- - - """ - - def __init__(self, *, - model_propensity='auto', - model_regression='auto', - model_final=StatsModelsLinearRegression(), - multitask_model_final=False, - featurizer=None, - min_propensity=1e-6, - categories='auto', - cv=2, - n_splits='raise', - mc_iters=None, - mc_agg='mean', - random_state=None): - self.model_propensity = clone(model_propensity, safe=False) - self.model_regression = clone(model_regression, safe=False) - self.model_final = clone(model_final, safe=False) - self.multitask_model_final = multitask_model_final - self.featurizer = clone(featurizer, safe=False) - self.min_propensity = min_propensity - super().__init__(cv=cv, - n_splits=n_splits, - mc_iters=mc_iters, - mc_agg=mc_agg, - discrete_treatment=True, - discrete_instrument=False, # no instrument, so doesn't matter - categories=categories, - random_state=random_state) - - def _get_inference_options(self): - options = super()._get_inference_options() - if not self.multitask_model_final: - options.update(auto=GenericModelFinalInferenceDiscrete) - else: - options.update(auto=lambda: None) - return options - - def _gen_ortho_learner_model_nuisance(self): - if self.model_propensity == 'auto': - model_propensity = LogisticRegressionCV(cv=3, solver='lbfgs', multi_class='auto', - random_state=self.random_state) - else: - model_propensity = clone(self.model_propensity, safe=False) - - if self.model_regression == 'auto': - model_regression = WeightedLassoCVWrapper(cv=3, random_state=self.random_state) - else: - model_regression = clone(self.model_regression, safe=False) - - return _ModelNuisance(model_propensity, model_regression, self.min_propensity) - - def _gen_featurizer(self): - return clone(self.featurizer, safe=False) - - def _gen_model_final(self): - return clone(self.model_final, safe=False) - - def _gen_ortho_learner_model_final(self): - return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), self.multitask_model_final) - - @_deprecate_positional("X and W should be passed by keyword only. In a future release " - "we will disallow passing X and W by position.", ['X', 'W']) - def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, - cache_values=False, inference='auto'): - """ - Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. - - Parameters - ---------- - Y: (n,) vector of length n - Outcomes for each sample - T: (n,) vector of length n - Treatments for each sample - X: optional(n, d_x) matrix or None (Default=None) - Features for each sample - W: optional(n, d_w) matrix or None (Default=None) - Controls for each sample - sample_weight: optional(n,) vector or None (Default=None) - Weights for each samples - sample_var: optional(n,) vector or None (Default=None) - Sample variance for each sample - groups: (n,) vector, optional - All rows corresponding to the same group will be kept together during splitting. - If groups is not None, the `cv` argument passed to this class's initializer - must support a 'groups' argument to its split method. - cache_values: bool, default False - Whether to cache inputs and first stage results, which will allow refitting a different final model - inference: string, :class:`.Inference` instance, or None - Method for performing inference. This estimator supports 'bootstrap' - (or an instance of :class:`.BootstrapInference`). 
- - Returns - ------- - self: DRLearner instance - """ - # Replacing fit from _OrthoLearner, to enforce Z=None and improve the docstring - return super().fit(Y, T, X=X, W=W, - sample_weight=sample_weight, sample_var=sample_var, groups=groups, - cache_values=cache_values, inference=inference) - - def refit_final(self, *, inference='auto'): - return super().refit_final(inference=inference) - refit_final.__doc__ = _OrthoLearner.refit_final.__doc__ - - def score(self, Y, T, X=None, W=None): - """ - Score the fitted CATE model on a new data set. Generates nuisance parameters - for the new data set based on the fitted residual nuisance models created at fit time. - It uses the mean prediction of the models fitted by the different crossfit folds. - Then calculates the MSE of the final residual Y on residual T regression. - - If model_final does not have a score method, then it raises an :exc:`.AttributeError` - - Parameters - ---------- - Y: (n,) vector of length n - Outcomes for each sample - T: (n,) vector of length n - Treatments for each sample - X: optional(n, d_x) matrix or None (Default=None) - Features for each sample - W: optional(n, d_w) matrix or None (Default=None) - Controls for each sample - - Returns - ------- - score: float - The MSE of the final CATE model on the new data. - """ - # Replacing score from _OrthoLearner, to enforce Z=None and improve the docstring - return super().score(Y, T, X=X, W=W) - - @property - def multitask_model_cate(self): - """ - Get the fitted final CATE model. - - Returns - ------- - multitask_model_cate: object of type(`model_final`) - An instance of the model_final object that was fitted after calling fit which corresponds whose - vector of outcomes correspond to the CATE model for each treatment, compared to baseline. - Available only when multitask_model_final=True. - """ - if not self.ortho_learner_model_final_._multitask_model_final: - raise AttributeError("Separate CATE models were fitted for each treatment! Use model_cate.") - return self.ortho_learner_model_final_.model_cate - - def model_cate(self, T=1): - """ - Get the fitted final CATE model. - - Parameters - ---------- - T: alphanumeric - The treatment with respect to which we want the fitted CATE model. - - Returns - ------- - model_cate: object of type(model_final) - An instance of the model_final object that was fitted after calling fit which corresponds - to the CATE model for treatment T=t, compared to baseline. Available when multitask_model_final=False. - """ - if self.ortho_learner_model_final_._multitask_model_final: - raise AttributeError("A single multitask model was fitted for all treatments! Use multitask_model_cate.") - _, T = self._expand_treatments(None, T) - ind = inverse_onehot(T).item() - 1 - assert ind >= 0, "No model was fitted for the control" - return self.ortho_learner_model_final_.models_cate[ind] - - @property - def models_propensity(self): - """ - Get the fitted propensity models. - - Returns - ------- - models_propensity: list of objects of type(`model_propensity`) - A list of instances of the `model_propensity` object. Each element corresponds to a crossfitting - fold and is the model instance that was fitted for that training fold. - """ - return [mdl._model_propensity for mdl in super().models_nuisance_] - - @property - def models_regression(self): - """ - Get the fitted regression models. - - Returns - ------- - model_regression: list of objects of type(`model_regression`) - A list of instances of the model_regression object. 
Each element corresponds to a crossfitting - fold and is the model instance that was fitted for that training fold. - """ - return [mdl._model_regression for mdl in super().models_nuisance_] - - @property - def nuisance_scores_propensity(self): - """Gets the score for the propensity model on out-of-sample training data""" - return self.nuisance_scores_[0] - - @property - def nuisance_scores_regression(self): - """Gets the score for the regression model on out-of-sample training data""" - return self.nuisance_scores_[1] - - @property - def featurizer_(self): - """ - Get the fitted featurizer. - - Returns - ------- - featurizer: object of type(`featurizer`) - An instance of the fitted featurizer that was used to preprocess X in the final CATE model training. - Available only when featurizer is not None and X is not None. - """ - return self.ortho_learner_model_final_._featurizer - - def cate_feature_names(self, feature_names=None): - """ - Get the output feature names. - - Parameters - ---------- - feature_names: list of strings of length X.shape[1] or None - The names of the input features. If None and X is a dataframe, it defaults to the column names - from the dataframe. - - Returns - ------- - out_feature_names: list of strings or None - The names of the output features :math:`\\phi(X)`, i.e. the features with respect to which the - final CATE model for each treatment is linear. It is the names of the features that are associated - with each entry of the :meth:`coef_` parameter. Available only when the featurizer is not None and has - a method: `get_feature_names(feature_names)`. Otherwise None is returned. - """ - if self._d_x is None: - # Handles the corner case when X=None but featurizer might be not None - return None - if feature_names is None: - feature_names = self._input_names["feature_names"] - if self.featurizer_ is None: - return feature_names - elif hasattr(self.featurizer_, 'get_feature_names'): - # This fails if X=None and featurizer is not None, but that case is handled above - return self.featurizer_.get_feature_names(feature_names) - else: - raise AttributeError("Featurizer does not have a method: get_feature_names!") - - @property - def model_final_(self): - return self.ortho_learner_model_final_._model_final - - @property - def fitted_models_final(self): - return self.ortho_learner_model_final_.models_cate - - def shap_values(self, X, *, feature_names=None, treatment_names=None, output_names=None, background_samples=100): - if self.featurizer_ is not None: - F = self.featurizer_.transform(X) - else: - F = X - feature_names = self.cate_feature_names(feature_names) - - if self.ortho_learner_model_final_._multitask_model_final: - return _shap_explain_multitask_model_cate(self.const_marginal_effect, self.multitask_model_cate, F, - self._d_t, self._d_y, - feature_names=feature_names, - treatment_names=treatment_names, - output_names=output_names, - input_names=self._input_names, - background_samples=background_samples) - else: - return _shap_explain_model_cate(self.const_marginal_effect, self.fitted_models_final, - F, self._d_t, self._d_y, - feature_names=feature_names, - treatment_names=treatment_names, - output_names=output_names, - input_names=self._input_names, - background_samples=background_samples) - shap_values.__doc__ = LinearCateEstimator.shap_values.__doc__ - - -class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner): - """ - Special case of the :class:`.DRLearner` where the final stage - is a Linear Regression on a low dimensional set of 
features. In this case, inference - can be performed via the asymptotic normal characterization of the estimated parameters. - This is computationally faster than bootstrap inference. To do this, just leave the setting ``inference='auto'`` - unchanged, or explicitly set ``inference='statsmodels'`` or alter the covariance type calculation via - ``inference=StatsModelsInferenceDiscrete(cov_type='HC1)``. - - More concretely, this estimator assumes that the final cate model for each treatment takes a linear form: - - .. math :: - \\theta_t(X) = \\left\\langle \\theta_t, \\phi(X) \\right\\rangle + \\beta_t - - where :math:`\\phi(X)` is the outcome features of the featurizers, or `X` if featurizer is None. :math:`\\beta_t` - is a an intercept of the CATE, which is included if ``fit_cate_intercept=True`` (Default). It fits this by - running a standard ordinary linear regression (OLS), regressing the doubly robust outcome differences on X: - - .. math :: - \\min_{\\theta_t, \\beta_t}\ - E_n\\left[\\left(Y_{i, t}^{DR} - Y_{i, 0}^{DR}\ - - \\left\\langle \\theta_t, \\phi(X_i) \\right\\rangle - \\beta_t\\right)^2\\right] - - Then inference can be performed via standard approaches for inference of OLS, via asympotic normal approximations - of the estimated parameters. The default covariance estimator used is heteroskedasticity robust (HC1). - For other methods see :class:`.StatsModelsInferenceDiscrete`. Use can invoke them by setting: - ``inference=StatsModelsInferenceDiscrete(cov_type=...)``. - - This approach is valid even if the CATE model is not linear in :math:`\\phi(X)`. In this case it performs - inference on the best linear approximation of the CATE model. - - Parameters - ---------- - model_propensity : scikit-learn classifier or 'auto', optional (default='auto') - Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. - Must implement `fit` and `predict_proba` methods. The `fit` method must be able to accept X and T, - where T is a shape (n, ) array. - If 'auto', :class:`~sklearn.linear_model.LogisticRegressionCV` will be chosen. - - model_regression : scikit-learn regressor or 'auto', optional (default='auto') - Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments) - concatenated. The one-hot-encoding excludes the baseline treatment. Must implement `fit` and - `predict` methods. If different models per treatment arm are desired, see the - :class:`.MultiModelWrapper` helper class. - If 'auto' :class:`.WeightedLassoCV`/:class:`.WeightedMultiTaskLassoCV` will be chosen. - - featurizer : :term:`transformer`, optional, default None - Must support fit_transform and transform. Used to create composite features in the final CATE regression. - It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). - If featurizer=None, then CATE is trained on X. - - fit_cate_intercept : bool, optional, default True - Whether the linear CATE model should have a constant term. - - min_propensity : float, optional, default ``1e-6`` - The minimum propensity at which to clip propensity estimates to avoid dividing by zero. - - categories: 'auto' or list, default 'auto' - The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). - The first category will be treated as the control treatment. - - cv: int, cross-validation generator or an iterable, optional (default is 2) - Determines the cross-validation splitting strategy. 
- Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - :term:`CV splitter` - - An iterable yielding (train, test) splits as arrays of indices. - - For integer/None inputs, if the treatment is discrete - :class:`~sklearn.model_selection.StratifiedKFold` is used, else, - :class:`~sklearn.model_selection.KFold` is used - (with a random shuffle in either case). - - Unless an iterable is used, we call `split(X,T)` to generate the splits. - - mc_iters: int, optional (default=None) - The number of times to rerun the first stage models to reduce the variance of the nuisances. - - mc_agg: {'mean', 'median'}, optional (default='mean') - How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of - cross-fitting. - - random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None - If int, random_state is the seed used by the random number generator; - If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; - If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used - by :mod:`np.random`. - - Examples - -------- - A simple example with the default models: - - .. testcode:: - - import numpy as np - import scipy.special - from econml.drlearner import DRLearner, LinearDRLearner - - np.set_printoptions(suppress=True) - np.random.seed(123) - X = np.random.normal(size=(1000, 3)) - T = np.random.binomial(2, scipy.special.expit(X[:, 0])) - y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(size=(1000,)) - est = LinearDRLearner() - est.fit(y, T, X=X, W=None) - - >>> est.effect(X[:3]) - array([ 0.409743..., 0.312604..., -0.127394...]) - >>> est.effect_interval(X[:3]) - (array([ 0.120682..., -0.102543..., -0.663246...]), array([0.698803..., 0.727753..., 0.408458...])) - >>> est.coef_(T=1) - array([ 0.450779..., -0.003214... , 0.063884... ]) - >>> est.coef__interval(T=1) - (array([ 0.202646..., -0.207195..., -0.104558...]), array([0.698911..., 0.200767..., 0.232326...])) - >>> est.intercept_(T=1) - 0.88425066... - >>> est.intercept__interval(T=1) - (0.68655813..., 1.08194320...) - - Attributes - ---------- - score_ : float - The MSE in the final doubly robust potential outcome regressions, i.e. - - .. math:: - \\frac{1}{n_t} \\sum_{t=1}^{n_t} \\frac{1}{n} \\sum_{i=1}^n (Y_{i, t}^{DR} - \\hat{\\theta}_t(X_i))^2 - - where n_t is the number of treatments (excluding control). - - If `sample_weight` is not None at fit time, then a weighted average across samples is returned. - - """ - - def __init__(self, *, - model_propensity='auto', - model_regression='auto', - featurizer=None, - fit_cate_intercept=True, - min_propensity=1e-6, - categories='auto', - cv=2, - n_splits='raise', - mc_iters=None, - mc_agg='mean', - random_state=None): - self.fit_cate_intercept = fit_cate_intercept - super().__init__(model_propensity=model_propensity, - model_regression=model_regression, - model_final=None, - featurizer=featurizer, - multitask_model_final=False, - min_propensity=min_propensity, - categories=categories, - cv=cv, - n_splits=n_splits, - mc_iters=mc_iters, - mc_agg=mc_agg, - random_state=random_state) - - def _gen_model_final(self): - return StatsModelsLinearRegression(fit_intercept=self.fit_cate_intercept) - - def _gen_ortho_learner_model_final(self): - return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False) - - @_deprecate_positional("X and W should be passed by keyword only. 
In a future release " - "we will disallow passing X and W by position.", ['X', 'W']) - def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, - cache_values=False, inference='auto'): - """ - Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. - - Parameters - ---------- - Y: (n,) vector of length n - Outcomes for each sample - T: (n,) vector of length n - Treatments for each sample - X: optional(n, d_x) matrix or None (Default=None) - Features for each sample - W: optional(n, d_w) matrix or None (Default=None) - Controls for each sample - sample_weight: optional(n,) vector or None (Default=None) - Weights for each samples - sample_var: optional(n,) vector or None (Default=None) - Sample variance for each sample - groups: (n,) vector, optional - All rows corresponding to the same group will be kept together during splitting. - If groups is not None, the `cv` argument passed to this class's initializer - must support a 'groups' argument to its split method. - cache_values: bool, default False - Whether to cache inputs and first stage results, which will allow refitting a different final model - inference: string, :class:`.Inference` instance, or None - Method for performing inference. This estimator supports ``'bootstrap'`` - (or an instance of :class:`.BootstrapInference`) and ``'statsmodels'`` - (or an instance of :class:`.StatsModelsInferenceDiscrete`). - - Returns - ------- - self: DRLearner instance - """ - # Replacing fit from DRLearner, to add statsmodels inference in docstring - return super().fit(Y, T, X=X, W=W, - sample_weight=sample_weight, sample_var=sample_var, groups=groups, - cache_values=cache_values, inference=inference) - - @property - def fit_cate_intercept_(self): - return self.model_final_.fit_intercept - - @property - def multitask_model_cate(self): - # Replacing this method which is invalid for this class, so that we make the - # dosctring empty and not appear in the docs. - return super().multitask_model_cate - - @property - def multitask_model_final(self): - return False - - @multitask_model_final.setter - def multitask_model_final(self, value): - if value: - raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!") - - @property - def model_final(self): - return self._gen_model_final() - - @model_final.setter - def model_final(self, model): - if model is not None: - raise ValueError("Parameter `model_final` cannot be altered for this estimator!") - - -class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner): - """ - Special case of the :class:`.DRLearner` where the final stage - is a Debiased Lasso Regression. In this case, inference can be performed via the debiased lasso approach - and its asymptotic normal characterization of the estimated parameters. This is computationally - faster than bootstrap inference. Leave the default ``inference='auto'`` unchanged, or explicitly set - ``inference='debiasedlasso'`` at fit time to enable inference via asymptotic normality. - - More concretely, this estimator assumes that the final cate model for each treatment takes a linear form: - - .. math :: - \\theta_t(X) = \\left\\langle \\theta_t, \\phi(X) \\right\\rangle + \\beta_t - - where :math:`\\phi(X)` is the outcome features of the featurizers, or `X` if featurizer is None. :math:`\\beta_t` - is a an intercept of the CATE, which is included if ``fit_cate_intercept=True`` (Default). 
It fits this by - running a debiased lasso regression (i.e. :math:`\\ell_1`-penalized regression with debiasing), - regressing the doubly robust outcome differences on X: i.e. first solves the penalized square loss problem - - .. math :: - \\min_{\\theta_t, \\beta_t}\ - E_n\\left[\\left(Y_{i, t}^{DR} - Y_{i, 0}^{DR}\ - - \\left\\langle \\theta_t, \\phi(X_i) \\right\\rangle - \\beta_t\\right)^2\\right]\ - + \\lambda \\left\\lVert \\theta_t \\right\\rVert_1 - - and then adds a debiasing correction to the solution. If alpha='auto' (recommended), then the penalty - weight :math:`\\lambda` is set optimally via cross-validation. - - This approach is valid even if the CATE model is not linear in :math:`\\phi(X)`. In this case it performs - inference on the best sparse linear approximation of the CATE model. - - Parameters - ---------- - model_propensity : scikit-learn classifier or 'auto', optional (default='auto') - Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. - Must implement `fit` and `predict_proba` methods. The `fit` method must be able to accept X and T, - where T is a shape (n, ) array. - If 'auto', :class:`~sklearn.linear_model.LogisticRegressionCV` will be chosen. - - model_regression : scikit-learn regressor or 'auto', optional (default='auto') - Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments) - concatenated. The one-hot-encoding excludes the baseline treatment. Must implement `fit` and - `predict` methods. If different models per treatment arm are desired, see the - :class:`.MultiModelWrapper` helper class. - If 'auto' :class:`.WeightedLassoCV`/:class:`.WeightedMultiTaskLassoCV` will be chosen. - - featurizer : :term:`transformer`, optional, default None - Must support fit_transform and transform. Used to create composite features in the final CATE regression. - It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). - If featurizer=None, then CATE is trained on X. - - fit_cate_intercept : bool, optional, default True - Whether the linear CATE model should have a constant term. - - alpha: string | float, optional., default 'auto'. - CATE L1 regularization applied through the debiased lasso in the final model. - 'auto' corresponds to a CV form of the :class:`DebiasedLasso`. - - n_alphas : int, optional, default 100 - How many alphas to try if alpha='auto' - - alpha_cov : string | float, optional, default 'auto' - The regularization alpha that is used when constructing the pseudo inverse of - the covariance matrix Theta used to for correcting the final state lasso coefficient - in the debiased lasso. Each such regression corresponds to the regression of one feature - on the remainder of the features. - - n_alphas_cov : int, optional, default 10 - How many alpha_cov to try if alpha_cov='auto'. - - max_iter : int, optional, default 1000 - The maximum number of iterations in the Debiased Lasso - - tol : float, optional, default 1e-4 - The tolerance for the optimization: if the updates are - smaller than ``tol``, the optimization code checks the - dual gap for optimality and continues until it is smaller - than ``tol``. - - n_jobs : int or None, optional (default=None) - The number of jobs to run in parallel for both `fit` and `predict`. - ``None`` means 1 unless in a :func:`joblib.parallel_backend` context. - ``-1`` means using all processors. 
- - min_propensity : float, optional, default ``1e-6`` - The minimum propensity at which to clip propensity estimates to avoid dividing by zero. - - categories: 'auto' or list, default 'auto' - The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). - The first category will be treated as the control treatment. - - cv: int, cross-validation generator or an iterable, optional, default 2 - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - :term:`CV splitter` - - An iterable yielding (train, test) splits as arrays of indices. - - For integer/None inputs, if the treatment is discrete - :class:`~sklearn.model_selection.StratifiedKFold` is used, else, - :class:`~sklearn.model_selection.KFold` is used - (with a random shuffle in either case). - - Unless an iterable is used, we call `split(X,T)` to generate the splits. - - mc_iters: int, optional (default=None) - The number of times to rerun the first stage models to reduce the variance of the nuisances. - - mc_agg: {'mean', 'median'}, optional (default='mean') - How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of - cross-fitting. - - random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None - If int, random_state is the seed used by the random number generator; - If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; - If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used - by :mod:`np.random`. - - Examples - -------- - A simple example with the default models: - - .. testcode:: - - import numpy as np - import scipy.special - from econml.drlearner import DRLearner, SparseLinearDRLearner - - np.set_printoptions(suppress=True) - np.random.seed(123) - X = np.random.normal(size=(1000, 3)) - T = np.random.binomial(2, scipy.special.expit(X[:, 0])) - y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(size=(1000,)) - est = SparseLinearDRLearner() - est.fit(y, T, X=X, W=None) - - >>> est.effect(X[:3]) - array([ 0.41..., 0.31..., -0.12...]) - >>> est.effect_interval(X[:3]) - (array([ 0.04..., -0.19..., -0.73...]), array([0.77..., 0.82..., 0.47...])) - >>> est.coef_(T=1) - array([ 0.45..., -0.00..., 0.06...]) - >>> est.coef__interval(T=1) - (array([ 0.24... , -0.19..., -0.13...]), array([0.65..., 0.19..., 0.26...])) - >>> est.intercept_(T=1) - 0.88... - >>> est.intercept__interval(T=1) - (0.68..., 1.08...) - - Attributes - ---------- - score_ : float - The MSE in the final doubly robust potential outcome regressions, i.e. - - .. math:: - \\frac{1}{n_t} \\sum_{t=1}^{n_t} \\frac{1}{n} \\sum_{i=1}^n (Y_{i, t}^{DR} - \\hat{\\theta}_t(X_i))^2 - - where n_t is the number of treatments (excluding control). - - If `sample_weight` is not None at fit time, then a weighted average across samples is returned. 
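Editorial sketch (not part of the patch): the penalized objective above is what this class solves before applying the debiasing correction. To make that final stage concrete, the snippet below runs a plain cross-validated Lasso of synthetic stand-in pseudo-outcome differences on a high-dimensional feature matrix; it deliberately omits the debiasing step (which is what yields the confidence intervals), and every input is a made-up placeholder.

```Python
import numpy as np
from sklearn.linear_model import LassoCV

rng = np.random.RandomState(123)
X = rng.normal(size=(1000, 25))                  # hypothetical phi(X) with 25 features
y_dr_diff = X[:, 0] + rng.normal(size=1000)      # placeholder for Y^DR_{i,t} - Y^DR_{i,0}
final = LassoCV(cv=3).fit(X, y_dr_diff)          # penalty chosen by CV, akin to alpha='auto'
print(final.coef_[:3], final.intercept_)         # sparse coefficient vector and intercept
```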
- - """ - - def __init__(self, *, - model_propensity='auto', - model_regression='auto', - featurizer=None, - fit_cate_intercept=True, - alpha='auto', - n_alphas=100, - alpha_cov='auto', - n_alphas_cov=10, - max_iter=1000, - tol=1e-4, - n_jobs=None, - min_propensity=1e-6, - categories='auto', - cv=2, - n_splits='raise', - mc_iters=None, - mc_agg='mean', - random_state=None): - self.fit_cate_intercept = fit_cate_intercept - self.alpha = alpha - self.n_alphas = n_alphas - self.alpha_cov = alpha_cov - self.n_alphas_cov = n_alphas_cov - self.max_iter = max_iter - self.tol = tol - self.n_jobs = n_jobs - super().__init__(model_propensity=model_propensity, - model_regression=model_regression, - model_final=None, - featurizer=featurizer, - multitask_model_final=False, - min_propensity=min_propensity, - categories=categories, - cv=cv, - n_splits=n_splits, - mc_iters=mc_iters, - mc_agg=mc_agg, - random_state=random_state) - - def _gen_model_final(self): - return DebiasedLasso(alpha=self.alpha, - n_alphas=self.n_alphas, - alpha_cov=self.alpha_cov, - n_alphas_cov=self.n_alphas_cov, - fit_intercept=self.fit_cate_intercept, - max_iter=self.max_iter, - tol=self.tol, - n_jobs=self.n_jobs, - random_state=self.random_state) - - def _gen_ortho_learner_model_final(self): - return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False) - - @_deprecate_positional("X and W should be passed by keyword only. In a future release " - "we will disallow passing X and W by position.", ['X', 'W']) - def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, - cache_values=False, inference='auto'): - """ - Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. - - Parameters - ---------- - Y: (n,) vector of length n - Outcomes for each sample - T: (n,) vector of length n - Treatments for each sample - X: optional(n, d_x) matrix or None (Default=None) - Features for each sample - W: optional(n, d_w) matrix or None (Default=None) - Controls for each sample - sample_weight: optional(n,) vector or None (Default=None) - Weights for each samples - sample_var: optional(n,) vector or None (Default=None) - Sample variance for each sample - groups: (n,) vector, optional - All rows corresponding to the same group will be kept together during splitting. - If groups is not None, the `cv` argument passed to this class's initializer - must support a 'groups' argument to its split method. - cache_values: bool, default False - Whether to cache inputs and first stage results, which will allow refitting a different final model - inference: string, :class:`.Inference` instance, or None - Method for performing inference. This estimator supports ``'bootstrap'`` - (or an instance of :class:`.BootstrapInference`) and ``'debiasedlasso'`` - (or an instance of :class:`.LinearModelInferenceDiscrete`). - - Returns - ------- - self: DRLearner instance - """ - # Replacing fit from DRLearner, to add debiasedlasso inference in docstring - # TODO: support sample_var - if sample_weight is not None and inference is not None: - warn("This estimator does not yet support sample variances and inference does not take " - "sample variances into account. This feature will be supported in a future release.") - check_high_dimensional(X, T, threshold=5, featurizer=self.featurizer, - discrete_treatment=self.discrete_treatment, - msg="The number of features in the final model (< 5) is too small for a sparse model. 
" - "We recommend using the LinearDRLearner for this low-dimensional setting.") - return super().fit(Y, T, X=X, W=W, - sample_weight=sample_weight, sample_var=None, groups=groups, - cache_values=cache_values, inference=inference) - - @property - def fit_cate_intercept_(self): - return self.model_final_.fit_intercept - - @property - def multitask_model_final(self): - return False - - @multitask_model_final.setter - def multitask_model_final(self, value): - if value: - raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!") - - @property - def model_final(self): - return self._gen_model_final() - - @model_final.setter - def model_final(self, model): - if model is not None: - raise ValueError("Parameter `model_final` cannot be altered for this estimator!") - - -class ForestDRLearner(ForestModelFinalCateEstimatorDiscreteMixin, DRLearner): - """ Instance of DRLearner with a :class:`~econml.sklearn_extensions.ensemble.SubsampledHonestForest` - as a final model, so as to enable non-parametric inference. - - Parameters - ---------- - model_propensity : scikit-learn classifier - Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. - Must implement `fit` and `predict_proba` methods. The `fit` method must be able to accept X and T, - where T is a shape (n, ) array. - - model_regression : scikit-learn regressor - Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments) - concatenated. The one-hot-encoding excludes the baseline treatment. Must implement `fit` and - `predict` methods. If different models per treatment arm are desired, see the - :class:`~econml.utilities.MultiModelWrapper` helper class. - - min_propensity : float, optional, default ``1e-6`` - The minimum propensity at which to clip propensity estimates to avoid dividing by zero. - - categories: 'auto' or list, default 'auto' - The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). - The first category will be treated as the control treatment. - - cv: int, cross-validation generator or an iterable, optional (Default=2) - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - :term:`CV splitter` - - An iterable yielding (train, test) splits as arrays of indices. - - For integer/None inputs, if the treatment is discrete - :class:`~sklearn.model_selection.StratifiedKFold` is used, else, - :class:`~sklearn.model_selection.KFold` is used - (with a random shuffle in either case). - - Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all - W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. - - n_crossfit_splits: int or 'raise', optional (default='raise') - Deprecated by parameter `cv` and will be removed in next version. Can be used - interchangeably with `cv`. - - mc_iters: int, optional (default=None) - The number of times to rerun the first stage models to reduce the variance of the nuisances. - - mc_agg: {'mean', 'median'}, optional (default='mean') - How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of - cross-fitting. - - n_estimators : integer, optional (default=100) - The total number of trees in the forest. 
The forest consists of a - forest of sqrt(n_estimators) sub-forests, where each sub-forest - contains sqrt(n_estimators) trees. - - criterion : string, optional (default="mse") - The function to measure the quality of a split. Supported criteria - are "mse" for the mean squared error, which is equal to variance - reduction as feature selection criterion, and "mae" for the mean - absolute error. - - max_depth : integer or None, optional (default=None) - The maximum depth of the tree. If None, then nodes are expanded until - all leaves are pure or until all leaves contain less than - min_samples_split samples. - - min_samples_split : int, float, optional (default=2) - The minimum number of splitting samples required to split an internal node. - - - If int, then consider `min_samples_split` as the minimum number. - - If float, then `min_samples_split` is a fraction and - `ceil(min_samples_split * n_samples)` are the minimum - number of samples for each split. - - min_samples_leaf : int, float, optional (default=1) - The minimum number of samples required to be at a leaf node. - A split point at any depth will only be considered if it leaves at - least ``min_samples_leaf`` splitting samples in each of the left and - right branches. This may have the effect of smoothing the model, - especially in regression. After construction the tree is also pruned - so that there are at least min_samples_leaf estimation samples on - each leaf. - - - If int, then consider `min_samples_leaf` as the minimum number. - - If float, then `min_samples_leaf` is a fraction and - `ceil(min_samples_leaf * n_samples)` are the minimum - number of samples for each node. - - min_weight_fraction_leaf : float, optional (default=0.) - The minimum weighted fraction of the sum total of weights (of all - splitting samples) required to be at a leaf node. Samples have - equal weight when sample_weight is not provided. After construction - the tree is pruned so that the fraction of the sum total weight - of the estimation samples contained in each leaf node is at - least min_weight_fraction_leaf - - max_features : int, float, string or None, optional (default="auto") - The number of features to consider when looking for the best split: - - - If int, then consider `max_features` features at each split. - - If float, then `max_features` is a fraction and - `int(max_features * n_features)` features are considered at each - split. - - If "auto", then `max_features=n_features`. - - If "sqrt", then `max_features=sqrt(n_features)`. - - If "log2", then `max_features=log2(n_features)`. - - If None, then `max_features=n_features`. - - Note: the search for a split does not stop until at least one - valid partition of the node samples is found, even if it requires to - effectively inspect more than ``max_features`` features. - - max_leaf_nodes : int or None, optional (default=None) - Grow trees with ``max_leaf_nodes`` in best-first fashion. - Best nodes are defined as relative reduction in impurity. - If None then unlimited number of leaf nodes. - - min_impurity_decrease : float, optional (default=0.) - A node will be split if this split induces a decrease of the impurity - greater than or equal to this value. 
- - The weighted impurity decrease equation is the following:: - - N_t / N * (impurity - N_t_R / N_t * right_impurity - - N_t_L / N_t * left_impurity) - - where ``N`` is the total number of split samples, ``N_t`` is the number of - split samples at the current node, ``N_t_L`` is the number of split samples in the - left child, and ``N_t_R`` is the number of split samples in the right child. - - ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, - if ``sample_weight`` is passed. - - subsample_fr : float or 'auto', optional (default='auto') - The fraction of the half-samples that are used on each tree. Each tree - will be built on subsample_fr * n_samples/2. - - If 'auto', then the subsampling fraction is set to:: - - (n_samples/2)**(1-1/(2*n_features+2))/(n_samples/2) - - which is sufficient to guarantee asympotitcally valid inference. - - honest : boolean, optional (default=True) - Whether to use honest trees, i.e. half of the samples are used for - creating the tree structure and the other half for the estimation at - the leafs. If False, then all samples are used for both parts. - - n_jobs : int or None, optional (default=None) - The number of jobs to run in parallel for both `fit` and `predict`. - ``None`` means 1 unless in a :func:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - - verbose : int, optional (default=0) - Controls the verbosity when fitting and predicting. - - random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; - If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used - by :mod:`np.random`. 
- """ - - def __init__(self, *, - model_regression="auto", - model_propensity="auto", - featurizer=None, - min_propensity=1e-6, - categories='auto', - cv=2, - n_crossfit_splits='raise', - mc_iters=None, - mc_agg='mean', - n_estimators=1000, - criterion='deprecated', - max_depth=None, - min_samples_split=5, - min_samples_leaf=5, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes='deprecated', - min_impurity_decrease=0., - subsample_fr='deprecated', - max_samples=.45, - min_balancedness_tol=.45, - honest=True, - subforest_size=4, - n_jobs=-1, - verbose=0, - random_state=None): - self.n_estimators = n_estimators - self.max_depth = max_depth - self.min_samples_split = min_samples_split - self.min_samples_leaf = min_samples_leaf - self.min_weight_fraction_leaf = min_weight_fraction_leaf - self.max_features = max_features - self.min_impurity_decrease = min_impurity_decrease - self.max_samples = max_samples - self.min_balancedness_tol = min_balancedness_tol - self.honest = honest - self.subforest_size = subforest_size - self.n_jobs = n_jobs - self.verbose = verbose - self.n_crossfit_splits = n_crossfit_splits - if self.n_crossfit_splits != 'raise': - cv = self.n_crossfit_splits - self.subsample_fr = subsample_fr - self.max_leaf_nodes = max_leaf_nodes - self.criterion = criterion - super().__init__(model_regression=model_regression, - model_propensity=model_propensity, - model_final=None, - featurizer=featurizer, - multitask_model_final=False, - min_propensity=min_propensity, - categories=categories, - cv=cv, - n_splits='raise', - mc_iters=mc_iters, - mc_agg=mc_agg, - random_state=random_state) - - def _gen_model_final(self): - return RegressionForest(n_estimators=self.n_estimators, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - min_weight_fraction_leaf=self.min_weight_fraction_leaf, - max_features=self.max_features, - min_impurity_decrease=self.min_impurity_decrease, - max_samples=self.max_samples, - min_balancedness_tol=self.min_balancedness_tol, - honest=self.honest, - inference=True, - subforest_size=self.subforest_size, - n_jobs=self.n_jobs, - random_state=self.random_state, - verbose=self.verbose, - warm_start=False) - - def _gen_ortho_learner_model_final(self): - return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False) - - @_deprecate_positional("X and W should be passed by keyword only. In a future release " - "we will disallow passing X and W by position.", ['X', 'W']) - def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, - cache_values=False, inference='auto'): - """ - Estimate the counterfactual model from data, i.e. estimates functions τ(·,·,·), ∂τ(·,·). - - Parameters - ---------- - Y: (n × d_y) matrix or vector of length n - Outcomes for each sample - T: (n × dₜ) matrix or vector of length n - Treatments for each sample - X: optional (n × dₓ) matrix - Features for each sample - W: optional (n × d_w) matrix - Controls for each sample - sample_weight: optional (n,) vector - Weights for each row - sample_var: optional (n, n_y) vector - Variance of sample, in case it corresponds to summary of many samples. Currently - not in use by this method (as inference method does not require sample variance info). - groups: (n,) vector, optional - All rows corresponding to the same group will be kept together during splitting. 
- If groups is not None, the `cv` argument passed to this class's initializer - must support a 'groups' argument to its split method. - cache_values: bool, default False - Whether to cache inputs and first stage results, which will allow refitting a different final model - inference: string, `Inference` instance, or None - Method for performing inference. This estimator supports 'bootstrap' - (or an instance of :class:`.BootstrapInference`) and 'blb' - (for Bootstrap-of-Little-Bags based inference) - - Returns - ------- - self - """ - return super().fit(Y, T, X=X, W=W, - sample_weight=sample_weight, sample_var=None, groups=groups, - cache_values=cache_values, inference=inference) - - def multitask_model_cate(self): - # Replacing to remove docstring - super().multitask_model_cate() - - @property - def multitask_model_final(self): - return False - - @multitask_model_final.setter - def multitask_model_final(self, value): - if value: - raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!") - - @property - def model_final(self): - return self._gen_model_final() - - @model_final.setter - def model_final(self, model): - if model is not None: - raise ValueError("Parameter `model_final` cannot be altered for this estimator!") - - #################################################################### - # Everything below should be removed once parameters are deprecated - #################################################################### - - @property - def n_crossfit_splits(self): - return self.cv - - @n_crossfit_splits.setter - def n_crossfit_splits(self, value): - if value != 'raise': - warn("Deprecated by parameter `cv` and will be removed in next version.") - self.cv = value - - @property - def criterion(self): - return self.criterion - - @criterion.setter - def criterion(self, value): - if value != 'deprecated': - warn("The parameter 'criterion' has been deprecated and will be removed in the next version. " - "Only the 'mse' criterion is supported.") - - @property - def max_leaf_nodes(self): - return self.max_leaf_nodes - - @max_leaf_nodes.setter - def max_leaf_nodes(self, value): - if value != 'deprecated': - warn("The parameter 'max_leaf_nodes' has been deprecated and will be removed in the next version.") - - @property - def subsample_fr(self): - return 2 * self.max_samples - - @subsample_fr.setter - def subsample_fr(self, value): - if value != 'deprecated': - warn("The parameter 'subsample_fr' has been deprecated and will be removed in the next version. " - "Use 'max_samples' instead, with the convention that " - "'subsample_fr=x' is equivalent to 'max_samples=x/2'.") - self.max_samples = .45 if value == 'auto' else value / 2 +@deprecated("The econml.drlearner.ForestDRLearner class has been moved to econml.dr.ForestDRLearner; " + "an upcoming release will remove support for the old name") +class ForestDRLearner(dr.ForestDRLearner): + pass diff --git a/econml/inference/__init__.py b/econml/inference/__init__.py new file mode 100644 index 00000000..921b6b4f --- /dev/null +++ b/econml/inference/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
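The new `econml/inference/__init__.py` introduced here re-exports the public inference classes from the (now private) `_inference` module via the import and `__all__` list that follow, so downstream code can keep importing them from `econml.inference`. A minimal sketch of that import path (names taken from the `__all__` list below):

```Python
# Names re-exported by the new econml.inference package (see __all__ below).
from econml.inference import BootstrapInference, StatsModelsInference
```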
+ +from ._inference import (BootstrapInference, GenericModelFinalInference, GenericSingleTreatmentModelFinalInference, + LinearModelFinalInference, StatsModelsInference, GenericModelFinalInferenceDiscrete, + LinearModelFinalInferenceDiscrete, StatsModelsInferenceDiscrete, + NormalInferenceResults, EmpiricalInferenceResults, + PopulationSummaryResults) + +__all__ = ["BootstrapInference", + "GenericModelFinalInference", + "GenericSingleTreatmentModelFinalInference", + "LinearModelFinalInference", + "StatsModelsInference", + "GenericModelFinalInferenceDiscrete", + "LinearModelFinalInferenceDiscrete", + "StatsModelsInferenceDiscrete", + "NormalInferenceResults", + "EmpiricalInferenceResults", + "PopulationSummaryResults"] diff --git a/econml/inference/_bootstrap.py b/econml/inference/_bootstrap.py new file mode 100644 index 00000000..bbd5abb9 --- /dev/null +++ b/econml/inference/_bootstrap.py @@ -0,0 +1,290 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +"""Bootstrap sampling.""" +import numpy as np +from joblib import Parallel, delayed +from sklearn.base import clone +from scipy.stats import norm +from collections import OrderedDict +import pandas as pd + + +class BootstrapEstimator: + """Estimator that uses bootstrap sampling to wrap an existing estimator. + + This estimator provides a `fit` method with the same signature as the wrapped estimator. + + The bootstrap estimator will also wrap all other methods and attributes of the wrapped estimator, + but return the average of the sampled calculations (this will fail for non-numeric outputs). + + It will also provide a wrapper method suffixed with `_interval` for each method or attribute of + the wrapped estimator that takes two additional optional keyword arguments `lower` and `upper` specifying + the percentiles of the interval, and which uses `np.percentile` to return the corresponding lower + and upper bounds based on the sampled calculations. For example, if the underlying estimator supports + an `effect` method with signature `(X,T) -> Y`, this class will provide a method `effect_interval` + with pseudo-signature `(lower=5, upper=95, X, T) -> (Y, Y)` (where `lower` and `upper` cannot be + supplied as positional arguments). + + Parameters + ---------- + wrapped : object + The basis for the clones used for estimation. + This object must support a `fit` method which takes numpy arrays with consistent first dimensions + as arguments. + + n_bootstrap_samples : int, default: 100 + How many draws to perform. + + n_jobs: int, default: None + The maximum number of concurrently running jobs, as in joblib.Parallel. + + verbose: int, default: 0 + Verbosity level + + compute_means : bool, default: True + Whether to pass calls through to the underlying collection and return the mean. Setting this + to ``False`` can avoid ambiguities if the wrapped object itself has method names with an `_interval` suffix. + + bootstrap_type: 'percentile', 'pivot', or 'normal', default 'pivot' + Bootstrap method used to compute results. 'percentile' will result in using the empirical CDF of + the replicated computations of the statistics. 'pivot' will also use the replicates but create a pivot + interval that also relies on the estimate over the entire dataset. 'normal' will instead compute an interval + assuming the replicates are normally distributed.
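To make the three `bootstrap_type` options described above concrete, here is a small self-contained NumPy/SciPy sketch (not library code) that builds each interval from an array of bootstrap replicates and the full-sample estimate, mirroring the `percentile_bootstrap`, `pivot_bootstrap` and `normal_bootstrap` helpers defined further down in this file:

```Python
import numpy as np
from scipy.stats import norm

def bootstrap_intervals(arr, est, lower=5, upper=95):
    """Illustrative restatement of the three interval constructions.

    arr: bootstrap replicates, one row per replicate; est: full-sample estimate.
    """
    # 'percentile': use the empirical quantiles of the replicates directly
    percentile = (np.percentile(arr, lower, axis=0), np.percentile(arr, upper, axis=0))
    # 'pivot': reflect the replicate quantiles around the full-sample estimate
    pivot = (2 * est - np.percentile(arr, upper, axis=0),
             2 * est - np.percentile(arr, lower, axis=0))
    # 'normal': assume the replicates are normally distributed around est
    std = np.std(arr, axis=0)
    normal = (est - norm.ppf(upper / 100) * std, est - norm.ppf(lower / 100) * std)
    return {'percentile': percentile, 'pivot': pivot, 'normal': normal}
```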
+ """ + + def __init__(self, wrapped, + n_bootstrap_samples=100, + n_jobs=None, + verbose=0, + compute_means=True, + bootstrap_type='pivot'): + self._instances = [clone(wrapped, safe=False) for _ in range(n_bootstrap_samples)] + self._n_bootstrap_samples = n_bootstrap_samples + self._n_jobs = n_jobs + self._verbose = verbose + self._compute_means = compute_means + self._bootstrap_type = bootstrap_type + self._wrapped = wrapped + + # TODO: Add a __dir__ implementation? + + @staticmethod + def __stratified_indices(arr): + assert 1 <= np.ndim(arr) <= 2 + unique = np.unique(arr, axis=0) + indices = [] + for el in unique: + ind, = np.where(np.all(arr == el, axis=1) if np.ndim(arr) == 2 else arr == el) + indices.append(ind) + return indices + + def fit(self, *args, **named_args): + """ + Fit the model. + + The full signature of this method is the same as that of the wrapped object's `fit` method. + """ + from .._cate_estimator import BaseCateEstimator # need to nest this here to avoid circular import + + index_chunks = None + if isinstance(self._instances[0], BaseCateEstimator): + index_chunks = self._instances[0]._strata(*args, **named_args) + if index_chunks is not None: + index_chunks = self.__stratified_indices(index_chunks) + if index_chunks is None: + n_samples = np.shape(args[0] if args else named_args[(*named_args,)[0]])[0] + index_chunks = [np.arange(n_samples)] # one chunk with all indices + + indices = [] + for chunk in index_chunks: + n_samples = len(chunk) + indices.append(chunk[np.random.choice(n_samples, + size=(self._n_bootstrap_samples, n_samples), + replace=True)]) + + indices = np.hstack(indices) + + def fit(x, *args, **kwargs): + x.fit(*args, **kwargs) + return x # Explicitly return x in case fit fails to return its target + + def convertArg(arg, inds): + if arg is None: + return None + arr = np.asarray(arg) + if arr.ndim > 0: + return arr[inds] + else: # arg was a scalar, so we shouldn't have converted it + return arg + + self._instances = Parallel(n_jobs=self._n_jobs, prefer='threads', verbose=self._verbose)( + delayed(fit)(obj, + *[convertArg(arg, inds) for arg in args], + **{arg: convertArg(named_args[arg], inds) for arg in named_args}) + for obj, inds in zip(self._instances, indices) + ) + return self + + def __getattr__(self, name): + """ + Get proxy attribute that wraps the corresponding attribute with the same name from the wrapped object. + + Additionally, the suffix "_interval" is supported for getting an interval instead of a point estimate. 
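A hedged usage sketch of the proxying behavior described above, wrapping a plain scikit-learn regressor instead of a CATE estimator (inside the library the wrapped object is normally an already-fitted estimator, which the default 'pivot' interval uses as the full-sample estimate); the data is synthetic and purely illustrative:

```Python
import numpy as np
from sklearn.linear_model import LinearRegression
from econml.inference._bootstrap import BootstrapEstimator  # internal module added by this change

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
y = X @ np.array([1.0, 2.0]) + rng.normal(scale=0.1, size=200)

base = LinearRegression().fit(X, y)           # full-sample fit, kept as the wrapped estimator
boot = BootstrapEstimator(base, n_bootstrap_samples=50)
boot.fit(X, y)                                # refits 50 clones on resampled rows

point = boot.predict(X[:5])                               # mean over the bootstrap replicates
lb, ub = boot.predict_interval(X[:5], lower=5, upper=95)  # bounds per the chosen bootstrap_type
```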
+ """ + + # don't proxy special methods + if name.startswith('__'): + raise AttributeError(name) + + def proxy(make_call, name, summary): + def summarize_with(f): + results = np.array(Parallel(n_jobs=self._n_jobs, prefer='threads', verbose=self._verbose)( + (f, (obj, name), {}) for obj in self._instances)), f(self._wrapped, name) + return summary(*results) + if make_call: + def call(*args, **kwargs): + return summarize_with(lambda obj, name: getattr(obj, name)(*args, **kwargs)) + return call + else: + return summarize_with(lambda obj, name: getattr(obj, name)) + + def get_mean(): + # for attributes that exist on the wrapped object, just compute the mean of the wrapped calls + return proxy(callable(getattr(self._instances[0], name)), name, lambda arr, _: np.mean(arr, axis=0)) + + def get_std(): + prefix = name[: - len('_std')] + return proxy(callable(getattr(self._instances[0], prefix)), prefix, + lambda arr, _: np.std(arr, axis=0)) + + def get_interval(): + # if the attribute exists on the wrapped object once we remove the suffix, + # then we should be computing a confidence interval for the wrapped calls + prefix = name[: - len("_interval")] + + def call_with_bounds(can_call, lower, upper): + def percentile_bootstrap(arr, _): + return np.percentile(arr, lower, axis=0), np.percentile(arr, upper, axis=0) + + def pivot_bootstrap(arr, est): + return 2 * est - np.percentile(arr, upper, axis=0), 2 * est - np.percentile(arr, lower, axis=0) + + def normal_bootstrap(arr, est): + std = np.std(arr, axis=0) + return est - norm.ppf(upper / 100) * std, est - norm.ppf(lower / 100) * std + + # TODO: studentized bootstrap? this would be more accurate in most cases but can we avoid + # second level bootstrap which would be prohibitive computationally? + + fn = {'percentile': percentile_bootstrap, + 'normal': normal_bootstrap, + 'pivot': pivot_bootstrap}[self._bootstrap_type] + return proxy(can_call, prefix, fn) + + can_call = callable(getattr(self._instances[0], prefix)) + if can_call: + # collect extra arguments and pass them through, if the wrapped attribute was callable + def call(*args, lower=5, upper=95, **kwargs): + return call_with_bounds(can_call, lower, upper)(*args, **kwargs) + return call + else: + # don't pass extra arguments if the wrapped attribute wasn't callable to begin with + def call(lower=5, upper=95): + return call_with_bounds(can_call, lower, upper) + return call + + def get_inference(): + # can't import from econml.inference at top level without creating cyclical dependencies + from ._inference import EmpiricalInferenceResults, NormalInferenceResults + from .._cate_estimator import LinearModelFinalCateEstimatorDiscreteMixin + + prefix = name[: - len("_inference")] + + def fname_transformer(x): + return x + + if prefix in ['const_marginal_effect', 'marginal_effect', 'effect']: + inf_type = 'effect' + elif prefix == 'coef_': + inf_type = 'coefficient' + if (hasattr(self._instances[0], 'cate_feature_names') and + callable(self._instances[0].cate_feature_names)): + def fname_transformer(x): + return self._instances[0].cate_feature_names(x) + elif prefix == 'intercept_': + inf_type = 'intercept' + else: + raise AttributeError("Unsupported inference: " + name) + + d_t = self._wrapped._d_t[0] if self._wrapped._d_t else 1 + if prefix == 'effect' or (isinstance(self._wrapped, LinearModelFinalCateEstimatorDiscreteMixin) and + (inf_type == 'coefficient' or inf_type == 'intercept')): + d_t = 1 + d_y = self._wrapped._d_y[0] if self._wrapped._d_y else 1 + + can_call = 
callable(getattr(self._instances[0], prefix)) + + kind = self._bootstrap_type + if kind == 'percentile' or kind == 'pivot': + def get_dist(est, arr): + if kind == 'percentile': + return arr + elif kind == 'pivot': + return 2 * est - arr + else: + raise ValueError("Invalid kind, must be either 'percentile' or 'pivot'") + + def get_result(): + return proxy(can_call, prefix, + lambda arr, est: EmpiricalInferenceResults( + d_t=d_t, d_y=d_y, + pred=est, pred_dist=get_dist(est, arr), + inf_type=inf_type, + fname_transformer=fname_transformer, + **self._wrapped._input_names if hasattr(self._wrapped, "_input_names") else None)) + + # Note that inference results are always methods even if the inference is for a property + # (e.g. coef__inference() is a method but coef_ is a property) + # Therefore we must insert a lambda if getting inference for a non-callable + return get_result() if can_call else get_result + + else: + assert kind == 'normal' + + def normal_inference(*args, **kwargs): + pred = getattr(self._wrapped, prefix) + if can_call: + pred = pred(*args, **kwargs) + stderr = getattr(self, prefix + '_std') + if can_call: + stderr = stderr(*args, **kwargs) + return NormalInferenceResults( + d_t=d_t, d_y=d_y, pred=pred, + pred_stderr=stderr, inf_type=inf_type, + fname_transformer=fname_transformer, + **self._wrapped._input_names if hasattr(self._wrapped, "_input_names") else None) + + # If inference is for a property, create a fresh lambda to avoid passing args through + return normal_inference if can_call else lambda: normal_inference() + + caught = None + m = None + if name.endswith("_interval"): + m = get_interval + elif name.endswith("_std"): + m = get_std + elif name.endswith("_inference"): + m = get_inference + + # try to get interval/std first if appropriate, + # since we don't prefer a wrapped method with this name + if m is not None: + try: + return m() + except AttributeError as err: + caught = err + if self._compute_means: + return get_mean() + + raise (caught if caught else AttributeError(name)) diff --git a/econml/inference.py b/econml/inference/_inference.py similarity index 99% rename from econml/inference.py rename to econml/inference/_inference.py index 56f02338..229891c0 100644 --- a/econml/inference.py +++ b/econml/inference/_inference.py @@ -11,12 +11,12 @@ import scipy from scipy.stats import norm from statsmodels.iolib.table import SimpleTable -from .bootstrap import BootstrapEstimator -from .sklearn_extensions.linear_model import StatsModelsLinearRegression -from .utilities import (Summary, _safe_norm_ppf, broadcast_unit_treatments, - cross_product, inverse_onehot, ndim, - parse_final_model_params, - reshape_treatmentwise_effects, shape) +from ._bootstrap import BootstrapEstimator +from ..sklearn_extensions.linear_model import StatsModelsLinearRegression +from ..utilities import (Summary, _safe_norm_ppf, broadcast_unit_treatments, + cross_product, inverse_onehot, ndim, + parse_final_model_params, + reshape_treatmentwise_effects, shape) """Options for performing inference in estimators.""" @@ -68,6 +68,9 @@ class BootstrapInference(Inference): n_jobs: int, optional (default -1) The maximum number of concurrently running jobs, as in joblib.Parallel. + verbose: int, default: 0 + Verbosity level + bootstrap_type: 'percentile', 'pivot', or 'normal', default 'pivot' Bootstrap method used to compute results. 'percentile' will result in using the empiracal CDF of the replicated computations of the statistics. 
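This change also adds a `verbose` option to `BootstrapInference` (documented in the hunk above and wired through `__init__` and `fit` in the next hunk). A hedged end-to-end sketch on synthetic placeholder data, using `LinearDML` only as a convenient host estimator:

```Python
import numpy as np
from econml.dml import LinearDML
from econml.inference import BootstrapInference

rng = np.random.default_rng(0)
n = 500
X = rng.normal(size=(n, 3))
W = rng.normal(size=(n, 2))
T = X[:, 0] + rng.normal(size=n)
y = 2.0 * T + X[:, 1] + rng.normal(size=n)

est = LinearDML(random_state=0)
est.fit(y, T, X=X, W=W,
        inference=BootstrapInference(n_bootstrap_samples=20, n_jobs=1,
                                     bootstrap_type='pivot', verbose=1))
lb, ub = est.effect_interval(X[:5], alpha=0.05)  # bootstrap confidence bounds
```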
@@ -76,14 +79,15 @@ class BootstrapInference(Inference): 'normal' will instead compute a pivot interval assuming the replicates are normally distributed. """ - def __init__(self, n_bootstrap_samples=100, n_jobs=-1, bootstrap_type='pivot'): + def __init__(self, n_bootstrap_samples=100, n_jobs=-1, bootstrap_type='pivot', verbose=0): self._n_bootstrap_samples = n_bootstrap_samples self._n_jobs = n_jobs self._bootstrap_type = bootstrap_type + self._verbose = verbose def fit(self, estimator, *args, **kwargs): est = BootstrapEstimator(estimator, self._n_bootstrap_samples, self._n_jobs, compute_means=False, - bootstrap_type=self._bootstrap_type) + bootstrap_type=self._bootstrap_type, verbose=self._verbose) est.fit(*args, **kwargs) self._est = est self._d_t = estimator._d_t diff --git a/econml/iv/__init__.py b/econml/iv/__init__.py new file mode 100644 index 00000000..6579a9f5 --- /dev/null +++ b/econml/iv/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +__all__ = ["dml", "dr", "nnet", "sieve"] diff --git a/econml/iv/_nuisance_wrappers.py b/econml/iv/_nuisance_wrappers.py new file mode 100644 index 00000000..aa4ef02a --- /dev/null +++ b/econml/iv/_nuisance_wrappers.py @@ -0,0 +1,83 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import numpy as np +from sklearn.base import clone +from ..utilities import (add_intercept, fit_with_groups, + hstack, inverse_onehot) +from ..dml.dml import _FinalWrapper as _DMLFinalWrapper + +# A cut-down version of the DML first stage wrapper, since we don't need to support linear first stages + + +class _FirstStageWrapper: + def __init__(self, model, discrete_target): + self._model = clone(model, safe=False) + self._discrete_target = discrete_target + + def _combine(self, X, W, Z, n_samples, fitting=True): + # output is + # * a column of ones if X, W, and Z are all None + # * just X or W or Z if both of the others are None + # * hstack([arrs]) for whatever subset are not None otherwise + + # ensure Z is 2D + if Z is not None: + Z = Z.reshape(n_samples, -1) + + if X is None and W is None and Z is None: + return np.ones((n_samples, 1)) + + arrs = [arr for arr in [X, W, Z] if arr is not None] + + if len(arrs) == 1: + return arrs[0] + else: + return hstack(arrs) + + def fit(self, *, X, W, Target, Z=None, sample_weight=None, groups=None): + if self._discrete_target: + # In this case, the Target is the one-hot-encoding of the treatment variable + # We need to go back to the label representation of the one-hot so as to call + # the classifier. + if np.any(np.all(Target == 0, axis=0)) or (not np.any(np.all(Target == 0, axis=1))): + raise AttributeError("Provided crossfit folds contain training splits that " + + "don't contain all treatments") + Target = inverse_onehot(Target) + + if sample_weight is not None: + fit_with_groups(self._model, self._combine(X, W, Z, Target.shape[0]), Target, + groups=groups, sample_weight=sample_weight) + else: + fit_with_groups(self._model, self._combine(X, W, Z, Target.shape[0]), Target, + groups=groups) + + def score(self, *, X, W, Target, Z=None, sample_weight=None): + if hasattr(self._model, 'score'): + if self._discrete_target: + # In this case, the Target is the one-hot-encoding of the treatment variable + # We need to go back to the label representation of the one-hot so as to call + # the classifier. 
+ if np.any(np.all(Target == 0, axis=0)) or (not np.any(np.all(Target == 0, axis=1))): + raise AttributeError("Provided crossfit folds contain training splits that " + + "don't contain all treatments") + Target = inverse_onehot(Target) + + if sample_weight is not None: + return self._model.score(self._combine(X, W, Z, Target.shape[0]), Target, sample_weight=sample_weight) + else: + return self._model.score(self._combine(X, W, Z, Target.shape[0]), Target) + else: + return None + + def predict(self, X, W, Z=None): + arrs = [arr for arr in [X, W, Z] if arr is not None] + n_samples = arrs[0].shape[0] if arrs else 1 + if self._discrete_target: + return self._model.predict_proba(self._combine(X, W, Z, n_samples, fitting=False))[:, 1:] + else: + return self._model.predict(self._combine(X, W, Z, n_samples, fitting=False)) + + +class _FinalWrapper(_DMLFinalWrapper): + pass diff --git a/econml/iv/dml/__init__.py b/econml/iv/dml/__init__.py new file mode 100644 index 00000000..ff42d62f --- /dev/null +++ b/econml/iv/dml/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +"""Orthogonal IV for Heterogeneous Treatment Effects. + +A Double/Orthogonal machine learning approach to estimation of heterogeneous +treatment effect with an endogenous treatment and an instrument. It +implements the DMLIV and related algorithms from the paper: + +Machine Learning Estimation of Heterogeneous Treatment Effects with Instruments +Vasilis Syrgkanis, Victor Lei, Miruna Oprescu, Maggie Hei, Keith Battocchi, Greg Lewis +https://arxiv.org/abs/1905.10176 + +""" + +from ._dml import DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV + +__all__ = ["DMLATEIV", + "ProjectedDMLATEIV", + "DMLIV", + "NonParamDMLIV"] diff --git a/econml/iv/dml/_dml.py b/econml/iv/dml/_dml.py new file mode 100644 index 00000000..b3c8b258 --- /dev/null +++ b/econml/iv/dml/_dml.py @@ -0,0 +1,931 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +"""Orthogonal IV for Heterogeneous Treatment Effects. + +A Double/Orthogonal machine learning approach to estimation of heterogeneous +treatment effect with an endogenous treatment and an instrument. 
It +implements the DMLIV and related algorithms from the paper: + +Machine Learning Estimation of Heterogeneous Treatment Effects with Instruments +Vasilis Syrgkanis, Victor Lei, Miruna Oprescu, Maggie Hei, Keith Battocchi, Greg Lewis +https://arxiv.org/abs/1905.10176 + +""" + +import numpy as np +from sklearn.base import clone +from sklearn.linear_model import LinearRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import FunctionTransformer + +from ..._ortho_learner import _OrthoLearner +from ..._cate_estimator import LinearModelFinalCateEstimatorMixin, StatsModelsCateEstimatorMixin +from ...inference import StatsModelsInference +from ...sklearn_extensions.linear_model import StatsModelsLinearRegression +from ...utilities import _deprecate_positional +from .._nuisance_wrappers import _FirstStageWrapper, _FinalWrapper + + +class _BaseDMLATEIVModelFinal: + def __init__(self): + self._first_stage = LinearRegression(fit_intercept=False) + self._model_final = _FinalWrapper(LinearRegression(fit_intercept=False), + fit_cate_intercept=True, featurizer=None, use_weight_trick=False) + + def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res, Z_res = nuisances + if Z_res.ndim == 1: + Z_res = Z_res.reshape(-1, 1) + # DMLATEIV is just like 2SLS; first regress T_res on Z_res, then regress Y_res on predicted T_res + T_res_pred = self._first_stage.fit(Z_res, T_res, + sample_weight=sample_weight).predict(Z_res) + # TODO: allow the final model to actually use X? Then we'd need to rename the class + # since we would actually be calculating a CATE rather than ATE. + self._model_final.fit(X=None, T_res=T_res_pred, Y_res=Y_res, sample_weight=sample_weight) + return self + + def predict(self, X=None): + # TODO: allow the final model to actually use X? + return self._model_final.predict(X=None) + + def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res, Z_res = nuisances + if Y_res.ndim == 1: + Y_res = Y_res.reshape((-1, 1)) + if T_res.ndim == 1: + T_res = T_res.reshape((-1, 1)) + # TODO: allow the final model to actually use X? + effects = self._model_final.predict(X=None).reshape((-1, Y_res.shape[1], T_res.shape[1])) + Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) + if sample_weight is not None: + return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) + else: + return np.mean((Y_res - Y_res_pred) ** 2) + + +class _BaseDMLATEIV(_OrthoLearner): + def __init__(self, discrete_instrument=False, + discrete_treatment=False, + categories='auto', + cv=2, + n_splits='raise', + mc_iters=None, + mc_agg='mean', + random_state=None): + super().__init__(discrete_treatment=discrete_treatment, + discrete_instrument=discrete_instrument, + categories=categories, + cv=cv, + n_splits=n_splits, + mc_iters=mc_iters, + mc_agg=mc_agg, + random_state=random_state) + + def _gen_ortho_learner_model_final(self): + return _BaseDMLATEIVModelFinal() + + @_deprecate_positional("W and Z should be passed by keyword only. In a future release " + "we will disallow passing W and Z by position.", ['W', 'Z']) + def fit(self, Y, T, Z, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference=None): + """ + Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. 
+ + Parameters + ---------- + Y: (n, d_y) matrix or vector of length n + Outcomes for each sample + T: (n, d_t) matrix or vector of length n + Treatments for each sample + Z: (n, d_z) matrix + Instruments for each sample + W: optional(n, d_w) matrix or None (Default=None) + Controls for each sample + sample_weight: optional(n,) vector or None (Default=None) + Weights for each sample + sample_var: optional(n,) vector or None (Default=None) + Sample variance for each sample + groups: (n,) vector, optional + All rows corresponding to the same group will be kept together during splitting. + If groups is not None, the `cv` argument passed to this class's initializer + must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model + inference: string, :class:`.Inference` instance, or None + Method for performing inference. This estimator supports 'bootstrap' + (or an instance of :class:`.BootstrapInference`). + + Returns + ------- + self: _BaseDMLATEIV instance + """ + # Replacing fit from _OrthoLearner, to enforce X=None and improve the docstring + return super().fit(Y, T, W=W, Z=Z, + sample_weight=sample_weight, sample_var=sample_var, groups=groups, + cache_values=cache_values, inference=inference) + + def score(self, Y, T, Z, W=None): + """ + Score the fitted CATE model on a new data set. Generates nuisance parameters + for the new data set based on the fitted residual nuisance models created at fit time. + It uses the mean prediction of the models fitted by the different crossfit folds. + Then calculates the MSE of the final residual Y on residual T regression. + + If model_final does not have a score method, then it raises an :exc:`.AttributeError` + + Parameters + ---------- + Y: (n, d_y) matrix or vector of length n + Outcomes for each sample + T: (n, d_t) matrix or vector of length n + Treatments for each sample + Z: (n, d_z) matrix + Instruments for each sample + W: optional(n, d_w) matrix or None (Default=None) + Controls for each sample + + + Returns + ------- + score: float + The MSE of the final CATE model on the new data.
+ """ + # Replacing score from _OrthoLearner, to enforce X=None and improve the docstring + return super().score(Y, T, W=W, Z=Z) + + +class _DMLATEIVModelNuisance: + def __init__(self, model_Y_W, model_T_W, model_Z_W): + self._model_Y_W = clone(model_Y_W, safe=False) + self._model_T_W = clone(model_T_W, safe=False) + self._model_Z_W = clone(model_Z_W, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + assert X is None, "DML ATE IV does not accept features" + self._model_Y_W.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) + self._model_T_W.fit(X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) + self._model_Z_W.fit(X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) + return self + + def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert X is None, "DML ATE IV does not accept features" + if hasattr(self._model_Y_W, 'score'): + Y_X_score = self._model_Y_W.score(X=X, W=W, Target=Y, sample_weight=sample_weight) + else: + Y_X_score = None + if hasattr(self._model_T_W, 'score'): + T_X_score = self._model_T_W.score(X=X, W=W, Target=T, sample_weight=sample_weight) + else: + T_X_score = None + if hasattr(self._model_Z_W, 'score'): + Z_X_score = self._model_Z_W.score(X=X, W=W, Target=Z, sample_weight=sample_weight) + else: + Z_X_score = None + return Y_X_score, T_X_score, Z_X_score + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert X is None, "DML ATE IV does not accept features" + Y_pred = self._model_Y_W.predict(X=X, W=W) + T_pred = self._model_T_W.predict(X=X, W=W) + Z_pred = self._model_Z_W.predict(X=X, W=W) + if W is None: # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + T_pred = np.tile(T_pred.reshape(1, -1), (T.shape[0], 1)) + Z_pred = np.tile(Z_pred.reshape(1, -1), (Z.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = T - T_pred.reshape(T.shape) + Z_res = Z - Z_pred.reshape(Z.shape) + return Y_res, T_res, Z_res + + +class DMLATEIV(_BaseDMLATEIV): + """ + Implementation of the orthogonal/double ml method for ATE estimation with + IV as described in + + Double/Debiased Machine Learning for Treatment and Causal Parameters + Victor Chernozhukov, Denis Chetverikov, Mert Demirer, Esther Duflo, Christian Hansen, Whitney Newey, James Robins + https://arxiv.org/abs/1608.00060 + + Requires that either co-variance of T, Z is independent of X or that effect + is not heterogeneous in X for correct recovery. Otherwise it estimates + a biased ATE. 
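Since the constructor and `fit` signature of `DMLATEIV` appear just below, here is a hedged usage sketch on synthetic placeholder data (linear nuisance models chosen purely for illustration):

```Python
import numpy as np
from sklearn.linear_model import LinearRegression
from econml.iv.dml import DMLATEIV

rng = np.random.default_rng(1)
n = 2000
W = rng.normal(size=(n, 2))
Z = rng.normal(size=n)                        # instrument
T = 0.8 * Z + W[:, 0] + rng.normal(size=n)    # treatment driven by the instrument and controls
y = 1.5 * T + W[:, 1] + rng.normal(size=n)

est = DMLATEIV(model_Y_W=LinearRegression(),
               model_T_W=LinearRegression(),
               model_Z_W=LinearRegression())
est.fit(y, T, Z=Z, W=W)
ate = est.effect()                            # constant (average) effect; no features X are used
```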
+ """ + + def __init__(self, *, + model_Y_W, + model_T_W, + model_Z_W, + discrete_treatment=False, + discrete_instrument=False, + categories='auto', + cv=2, + n_splits='raise', + mc_iters=None, + mc_agg='mean', + random_state=None): + self.model_Y_W = clone(model_Y_W, safe=False) + self.model_T_W = clone(model_T_W, safe=False) + self.model_Z_W = clone(model_Z_W, safe=False) + super().__init__(discrete_instrument=discrete_instrument, + discrete_treatment=discrete_treatment, + categories=categories, + cv=cv, + n_splits=n_splits, + mc_iters=mc_iters, + mc_agg=mc_agg, + random_state=random_state) + + def _gen_ortho_learner_model_nuisance(self): + return _DMLATEIVModelNuisance( + model_Y_W=_FirstStageWrapper(clone(self.model_Y_W, safe=False), discrete_target=False), + model_T_W=_FirstStageWrapper(clone(self.model_T_W, safe=False), discrete_target=self.discrete_treatment), + model_Z_W=_FirstStageWrapper(clone(self.model_Z_W, safe=False), discrete_target=self.discrete_instrument)) + + +class _ProjectedDMLATEIVModelNuisance: + + def __init__(self, model_Y_W, model_T_W, model_T_WZ): + self._model_Y_W = clone(model_Y_W, safe=False) + self._model_T_W = clone(model_T_W, safe=False) + self._model_T_WZ = clone(model_T_WZ, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + assert X is None, "DML ATE IV does not accept features" + self._model_Y_W.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) + self._model_T_W.fit(X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) + self._model_T_WZ.fit(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight, groups=groups) + return self + + def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert X is None, "DML ATE IV does not accept features" + if hasattr(self._model_Y_W, 'score'): + Y_X_score = self._model_Y_W.score(X=X, W=W, Target=Y, sample_weight=sample_weight) + else: + Y_X_score = None + if hasattr(self._model_T_W, 'score'): + T_X_score = self._model_T_W.score(X=X, W=W, Target=T, sample_weight=sample_weight) + else: + T_X_score = None + if hasattr(self._model_T_WZ, 'score'): + T_XZ_score = self._model_T_WZ.score(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight) + else: + T_XZ_score = None + return Y_X_score, T_X_score, T_XZ_score + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert X is None, "DML ATE IV does not accept features" + Y_pred = self._model_Y_W.predict(X, W) + TX_pred = self._model_T_W.predict(X, W) + TXZ_pred = self._model_T_WZ.predict(X, W, Z) + if W is None: # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + TX_pred = np.tile(TX_pred.reshape(1, -1), (T.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = T - TX_pred.reshape(T.shape) + Z_res = TXZ_pred.reshape(T.shape) - TX_pred.reshape(T.shape) + return Y_res, T_res, Z_res + + +class ProjectedDMLATEIV(_BaseDMLATEIV): + + def __init__(self, *, + model_Y_W, + model_T_W, + model_T_WZ, + discrete_treatment=False, + discrete_instrument=False, + categories='auto', + cv=2, + n_splits='raise', + mc_iters=None, + mc_agg='mean', + random_state=None): + self.model_Y_W = clone(model_Y_W, safe=False) + self.model_T_W = clone(model_T_W, safe=False) + self.model_T_WZ = clone(model_T_WZ, safe=False) + super().__init__(discrete_instrument=discrete_instrument, + discrete_treatment=discrete_treatment, + categories=categories, + cv=cv, + n_splits=n_splits, + mc_iters=mc_iters, + mc_agg=mc_agg, + random_state=random_state) + 
+ def _gen_ortho_learner_model_nuisance(self): + return _ProjectedDMLATEIVModelNuisance( + model_Y_W=_FirstStageWrapper(clone(self.model_Y_W, safe=False), discrete_target=False), + model_T_W=_FirstStageWrapper(clone(self.model_T_W, safe=False), discrete_target=self.discrete_treatment), + model_T_WZ=_FirstStageWrapper(clone(self.model_T_WZ, safe=False), + discrete_target=self.discrete_treatment)) + + +class _BaseDMLIVModelNuisance: + """ + Nuisance model fits the three models at fit time and at predict time + returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals. + """ + + def __init__(self, model_Y_X, model_T_X, model_T_XZ): + self._model_Y_X = clone(model_Y_X, safe=False) + self._model_T_X = clone(model_T_X, safe=False) + self._model_T_XZ = clone(model_T_XZ, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + # TODO: would it be useful to extend to handle controls ala vanilla DML? + assert W is None, "DML IV does not accept controls" + self._model_Y_X.fit(X=X, W=None, Target=Y, sample_weight=sample_weight, groups=groups) + self._model_T_X.fit(X=X, W=None, Target=T, sample_weight=sample_weight, groups=groups) + self._model_T_XZ.fit(X=X, W=None, Z=Z, Target=T, sample_weight=sample_weight, groups=groups) + return self + + def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert W is None, "DML IV does not accept controls" + if hasattr(self._model_Y_X, 'score'): + Y_X_score = self._model_Y_X.score(X=X, W=W, Target=Y, sample_weight=sample_weight) + else: + Y_X_score = None + if hasattr(self._model_T_X, 'score'): + T_X_score = self._model_T_X.score(X=X, W=W, Target=T, sample_weight=sample_weight) + else: + T_X_score = None + if hasattr(self._model_T_XZ, 'score'): + T_XZ_score = self._model_T_XZ.score(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight) + else: + T_XZ_score = None + return Y_X_score, T_X_score, T_XZ_score + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + assert W is None, "DML IV does not accept controls" + Y_pred = self._model_Y_X.predict(X, W) + TXZ_pred = self._model_T_XZ.predict(X, W, Z) + TX_pred = self._model_T_X.predict(X, W) + if X is None: # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + TX_pred = np.tile(TX_pred.reshape(1, -1), (T.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = TXZ_pred.reshape(T.shape) - TX_pred.reshape(T.shape) + return Y_res, T_res + + +class _BaseDMLIVModelFinal: + """ + Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient + that depends on X, i.e. + + .. math :: + Y - \\E[Y | X] = \\theta(X) \\cdot (\\E[T | X, Z] - \\E[T | X]) + \\epsilon + + and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final + residual on residual regression. 
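To make this final-stage moment concrete, here is a small self-contained NumPy/scikit-learn sketch (not library code; a scalar treatment and a linear effect model with `phi(X) = X` are assumed) of regressing the outcome residual on the feature-interacted treatment residual:

```Python
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
n, d_x = 1000, 3
X = rng.normal(size=(n, d_x))
T_res = rng.normal(size=n)                        # stand-in for E[T|X,Z] - E[T|X]
theta = X @ np.array([1.0, -0.5, 0.25])           # true heterogeneous effect theta(X)
Y_res = theta * T_res + rng.normal(scale=0.1, size=n)  # stand-in for Y - E[Y|X]

# Regress Y_res on X * T_res; with a linear effect model the coefficients recover theta
final = LinearRegression(fit_intercept=False).fit(X * T_res[:, None], Y_res)
theta_hat = X @ final.coef_                       # estimated CATE at each X
```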
+ """ + + def __init__(self, model_final): + self._model_final = clone(model_final, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res = nuisances + self._model_final.fit(X, T_res, Y_res, sample_weight=sample_weight, sample_var=sample_var) + return self + + def predict(self, X=None): + return self._model_final.predict(X) + + def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + Y_res, T_res = nuisances + if Y_res.ndim == 1: + Y_res = Y_res.reshape((-1, 1)) + if T_res.ndim == 1: + T_res = T_res.reshape((-1, 1)) + effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) + Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) + if sample_weight is not None: + return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) + else: + return np.mean((Y_res - Y_res_pred)**2) + + +class _BaseDMLIV(_OrthoLearner): + """ + The class _BaseDMLIV implements the base class of the DMLIV + algorithm for estimating a CATE. It accepts three generic machine + learning models: + 1) model_Y_X that estimates :math:`\\E[Y | X]` + 2) model_T_X that estimates :math:`\\E[T | X]` + 3) model_T_XZ that estimates :math:`\\E[T | X, Z]` + These are estimated in a cross-fitting manner for each sample in the training set. + Then it minimizes the square loss: + + .. math:: + \\sum_i (Y_i - \\E[Y|X_i] - \theta(X) * (\\E[T|X_i, Z_i] - \\E[T|X_i]))^2 + + This loss is minimized by the model_final class, which is passed as an input. + In the two children classes {DMLIV, GenericDMLIV}, we implement different strategies of how to invoke + machine learning algorithms to minimize this final square loss. + + + Parameters + ---------- + model_Y_X : estimator + model to estimate :math:`\\E[Y | X]`. Must support `fit` and `predict` methods. + + model_T_X : estimator + model to estimate :math:`\\E[T | X]`. Must support `fit` and `predict` methods + + model_T_XZ : estimator + model to estimate :math:`\\E[T | X, Z]`. Must support `fit(X, Z, T, *, sample_weights)` + and `predict(X, Z)` methods. + + model_final : estimator + final model that at fit time takes as input :math:`(Y-\\E[Y|X])`, :math:`(\\E[T|X,Z]-\\E[T|X])` and X + and supports method predict(X) that produces the CATE at X + + discrete_instrument: bool, optional, default False + Whether the instrument values should be treated as categorical, rather than continuous, quantities + + discrete_treatment: bool, optional, default False + Whether the treatment values should be treated as categorical, rather than continuous, quantities + + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + + cv: int, cross-validation generator or an iterable, optional, default 2 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter` + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the treatment is discrete + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used + (with a random shuffle in either case). + + Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. 
If all + W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + + mc_iters: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + + mc_agg: {'mean', 'median'}, optional (default='mean') + How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of + cross-fitting. + + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; + If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used + by :mod:`np.random`. + """ + + def __init__(self, discrete_instrument=False, discrete_treatment=False, categories='auto', + cv=2, + n_splits='raise', + mc_iters=None, mc_agg='mean', + random_state=None): + super().__init__(discrete_treatment=discrete_treatment, + discrete_instrument=discrete_instrument, + categories=categories, + cv=cv, + n_splits=n_splits, + mc_iters=mc_iters, + mc_agg=mc_agg, + random_state=random_state) + + @_deprecate_positional("Z and X should be passed by keyword only. In a future release " + "we will disallow passing Z and X by position.", ['X', 'Z']) + def fit(self, Y, T, Z, X=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference=None): + """ + Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. + + Parameters + ---------- + Y: (n, d_y) matrix or vector of length n + Outcomes for each sample + T: (n, d_t) matrix or vector of length n + Treatments for each sample + Z: (n, d_z) matrix + Instruments for each sample + X: optional(n, d_x) matrix or None (Default=None) + Features for each sample + sample_weight: optional(n,) vector or None (Default=None) + Weights for each samples + sample_var: optional(n,) vector or None (Default=None) + Sample variance for each sample + groups: (n,) vector, optional + All rows corresponding to the same group will be kept together during splitting. + If groups is not None, the `cv` argument passed to this class's initializer + must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model + inference: string,:class:`.Inference` instance, or None + Method for performing inference. This estimator supports 'bootstrap' + (or an instance of:class:`.BootstrapInference`). + + Returns + ------- + self: _BaseDMLIV + """ + # Replacing fit from _OrthoLearner, to enforce W=None and improve the docstring + return super().fit(Y, T, X=X, Z=Z, + sample_weight=sample_weight, sample_var=sample_var, groups=groups, + cache_values=cache_values, inference=inference) + + def score(self, Y, T, Z, X=None): + """ + Score the fitted CATE model on a new data set. Generates nuisance parameters + for the new data set based on the fitted residual nuisance models created at fit time. + It uses the mean prediction of the models fitted by the different crossfit folds. + Then calculates the MSE of the final residual Y on residual T regression. 
+ + If model_final does not have a score method, then it raises an :exc:`.AttributeError` + + Parameters + ---------- + Y: (n, d_y) matrix or vector of length n + Outcomes for each sample + T: (n, d_t) matrix or vector of length n + Treatments for each sample + Z: optional(n, d_z) matrix + Instruments for each sample + X: optional(n, d_x) matrix or None (Default=None) + Features for each sample + + + Returns + ------- + score: float + The MSE of the final CATE model on the new data. + """ + # Replacing score from _OrthoLearner, to enforce W=None and improve the docstring + return super().score(Y, T, X=X, Z=Z) + + @property + def original_featurizer(self): + return self.ortho_learner_model_final_._model_final._original_featurizer + + @property + def featurizer_(self): + # NOTE This is used by the inference methods and has to be the overall featurizer. intended + # for internal use by the library + return self.ortho_learner_model_final_._model_final._featurizer + + @property + def model_final_(self): + # NOTE This is used by the inference methods and is more for internal use to the library + return self.ortho_learner_model_final_._model_final._model + + @property + def model_cate(self): + """ + Get the fitted final CATE model. + + Returns + ------- + model_cate: object of type(model_final) + An instance of the model_final object that was fitted after calling fit which corresponds + to the constant marginal CATE model. + """ + return self.ortho_learner_model_final_._model_final._model + + @property + def models_Y_X(self): + """ + Get the fitted models for :math:`\\E[Y | X]`. + + Returns + ------- + models_Y_X: list of objects of type(`model_Y_X`) + A list of instances of the `model_Y_X` object. Each element corresponds to a crossfitting + fold and is the model instance that was fitted for that training fold. + """ + return [mdl._model_Y_X._model for mdl in super().models_nuisance_] + + @property + def models_T_X(self): + """ + Get the fitted models for :math:`\\E[T | X]`. + + Returns + ------- + models_T_X: list of objects of type(`model_T_X`) + A list of instances of the `model_T_X` object. Each element corresponds to a crossfitting + fold and is the model instance that was fitted for that training fold. + """ + return [mdl._model_T_X._model for mdl in super().models_nuisance_] + + @property + def models_T_XZ(self): + """ + Get the fitted models for :math:`\\E[T | X, Z]`. + + Returns + ------- + models_T_XZ: list of objects of type(`model_T_XZ`) + A list of instances of the `model_T_XZ` object. Each element corresponds to a crossfitting + fold and is the model instance that was fitted for that training fold. + """ + return [mdl._model_T_XZ._model for mdl in super().models_nuisance_] + + @property + def nuisance_scores_Y_X(self): + """ + Get the scores for Y_X model on the out-of-sample training data + """ + return self.nuisance_scores_[0] + + @property + def nuisance_scores_T_X(self): + """ + Get the scores for T_X model on the out-of-sample training data + """ + return self.nuisance_scores_[1] + + @property + def nuisance_scores_T_XZ(self): + """ + Get the scores for T_XZ model on the out-of-sample training data + """ + return self.nuisance_scores_[2] + + def cate_feature_names(self, feature_names=None): + """ + Get the output feature names. + + Parameters + ---------- + feature_names: list of strings of length X.shape[1] or None + The names of the input features. If None and X is a dataframe, it defaults to the column names + from the dataframe. 
+ + Returns + ------- + out_feature_names: list of strings or None + The names of the output features :math:`\\phi(X)`, i.e. the features with respect to which the + final constant marginal CATE model is linear. It is the names of the features that are associated + with each entry of the :meth:`coef_` parameter. Not available when the featurizer is not None and + does not have a method: `get_feature_names(feature_names)`. Otherwise None is returned. + """ + if feature_names is None: + feature_names = self._input_names["feature_names"] + if self.original_featurizer is None: + return feature_names + elif hasattr(self.original_featurizer, 'get_feature_names'): + return self.original_featurizer.get_feature_names(feature_names) + else: + raise AttributeError("Featurizer does not have a method: get_feature_names!") + + +class DMLIV(LinearModelFinalCateEstimatorMixin, _BaseDMLIV): + """ + A child of the _BaseDMLIV class that specifies a particular effect model + where the treatment effect is linear in some featurization of the variable X + The features are created by a provided featurizer that supports fit_transform. + Then an arbitrary model fits on the composite set of features. + + Concretely, it assumes that :math:`\\theta(X)=<\\theta, \\phi(X)>` for some features :math:`\\phi(X)` + and runs a linear model regression of :math:`Y-\\E[Y|X]` on :math:`phi(X)*(\\E[T|X,Z]-\\E[T|X])`. + The features are created by the featurizer provided by the user. The particular + linear model regression is also specified by the user (e.g. Lasso, ElasticNet) + + Parameters + ---------- + model_Y_X : estimator + model to estimate :math:`\\E[Y | X]`. Must support `fit` and `predict` methods. + + model_T_X : estimator + model to estimate :math:`\\E[T | X]`. Must support `fit` and either `predict` or `predict_proba` methods, + depending on whether the treatment is discrete. + + model_T_XZ : estimator + model to estimate :math:`\\E[T | X, Z]`. Must support `fit` and either `predict` or `predict_proba` methods, + depending on whether the treatment is discrete. + + model_final : estimator + final linear model for predicting :math:`(Y-\\E[Y|X])` from :math:`\\phi(X) \\cdot (\\E[T|X,Z]-\\E[T|X])` + Method is incorrect if this model is not linear (e.g. Lasso, ElasticNet, LinearRegression). + + featurizer: :term:`transformer`, optional, default None + Must support fit_transform and transform. Used to create composite features in the final CATE regression. + It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). + If featurizer=None, then CATE is trained on X. + + fit_cate_intercept : bool, optional, default True + Whether the linear CATE model should have a constant term. + + cv: int, cross-validation generator or an iterable, optional, default 2 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter` + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the treatment is discrete + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used + (with a random shuffle in either case). + + Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all + W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. 
+ + mc_iters: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + + mc_agg: {'mean', 'median'}, optional (default='mean') + How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of + cross-fitting. + + discrete_instrument: bool, optional, default False + Whether the instrument values should be treated as categorical, rather than continuous, quantities + + discrete_treatment: bool, optional, default False + Whether the treatment values should be treated as categorical, rather than continuous, quantities + + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; + If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used + by :mod:`np.random`. + """ + + def __init__(self, *, + model_Y_X, + model_T_X, + model_T_XZ, + model_final, + featurizer=None, + fit_cate_intercept=True, + cv=2, + n_splits='raise', + mc_iters=None, + mc_agg='mean', + discrete_instrument=False, discrete_treatment=False, + categories='auto', random_state=None): + self.model_Y_X = clone(model_Y_X, safe=False) + self.model_T_X = clone(model_T_X, safe=False) + self.model_T_XZ = clone(model_T_XZ, safe=False) + self.model_final = clone(model_final, safe=False) + self.featurizer = clone(featurizer, safe=False) + self.fit_cate_intercept = fit_cate_intercept + super().__init__(cv=cv, + n_splits=n_splits, + mc_iters=mc_iters, + mc_agg=mc_agg, + discrete_instrument=discrete_instrument, + discrete_treatment=discrete_treatment, + categories=categories, + random_state=random_state) + + def _gen_ortho_learner_model_nuisance(self): + return _BaseDMLIVModelNuisance(_FirstStageWrapper(clone(self.model_Y_X, safe=False), False), + _FirstStageWrapper(clone(self.model_T_X, safe=False), self.discrete_treatment), + _FirstStageWrapper(clone(self.model_T_XZ, safe=False), self.discrete_treatment)) + + def _gen_ortho_learner_model_final(self): + return _BaseDMLIVModelFinal(_FinalWrapper(clone(self.model_final, safe=False), + fit_cate_intercept=self.fit_cate_intercept, + featurizer=clone(self.featurizer, safe=False), + use_weight_trick=False)) + + @property + def bias_part_of_coef(self): + return self.ortho_learner_model_final_._model_final._fit_cate_intercept + + @property + def fit_cate_intercept_(self): + return self.ortho_learner_model_final_._model_final._fit_cate_intercept + + +class NonParamDMLIV(_BaseDMLIV): + """ + A child of the _BaseDMLIV class that allows for an arbitrary square loss based ML + method in the final stage of the DMLIV algorithm. The method has to support + sample weights and the fit method has to take as input sample_weights (e.g. random forests), i.e. + fit(X, y, sample_weight=None) + It achieves this by re-writing the final stage square loss of the DMLIV algorithm as: + + .. math :: + \\sum_i (\\E[T|X_i, Z_i] - \\E[T|X_i])^2 * ((Y_i - \\E[Y|X_i])/(\\E[T|X_i, Z_i] - \\E[T|X_i]) - \\theta(X))^2 + + Then this can be viewed as a weighted square loss regression, where the target label is + + .. 
math :: + \\tilde{Y}_i = (Y_i - \\E[Y|X_i])/(\\E[T|X_i, Z_i] - \\E[T|X_i]) + + and each sample has a weight of + + .. math :: + V(X_i) = (\\E[T|X_i, Z_i] - \\E[T|X_i])^2 + + Thus we can call any regression model with inputs: + + fit(X, :math:`\\tilde{Y}_i`, sample_weight= :math:`V(X_i)`) + + Parameters + ---------- + model_Y_X : estimator + model to estimate :math:`\\E[Y | X]`. Must support `fit` and `predict` methods. + + model_T_X : estimator + model to estimate :math:`\\E[T | X]`. Must support `fit` and either `predict` or `predict_proba` methods, + depending on whether the treatment is discrete. + + model_T_XZ : estimator + model to estimate :math:`\\E[T | X, Z]`. Must support `fit` and either `predict` or `predict_proba` methods, + depending on whether the treatment is discrete. + + model_final : estimator + final model for predicting :math:`\\tilde{Y}` from X with sample weights V(X) + + cv: int, cross-validation generator or an iterable, optional, default 2 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter` + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the treatment is discrete + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used + (with a random shuffle in either case). + + Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all + W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + + mc_iters: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + + mc_agg: {'mean', 'median'}, optional (default='mean') + How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of + cross-fitting. + + discrete_instrument: bool, optional, default False + Whether the instrument values should be treated as categorical, rather than continuous, quantities + + discrete_treatment: bool, optional, default False + Whether the treatment values should be treated as categorical, rather than continuous, quantities + + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; + If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used + by :mod:`np.random`. 
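The weighted square-loss trick described above can be made concrete with a small plain-numpy sketch. Everything below is illustrative only: the residuals are simulated rather than produced by the library's first-stage models, and any weighted regressor could stand in for the final stage.

```Python
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Simulated stand-ins for the first-stage residuals:
#   y_res = Y - E[Y|X],   delta = E[T|X,Z] - E[T|X]
# generated so that the true effect is theta(X) = 1 + X[:, 0]
rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))
delta = rng.normal(size=500)
y_res = (1 + X[:, 0]) * delta + 0.1 * rng.normal(size=500)

# Reformulation from the docstring: regress the pseudo-outcome on X with weights V(X)
y_tilde = y_res / delta        # pseudo-outcome  \tilde{Y}_i
weights = delta ** 2           # sample weight   V(X_i)

final_model = RandomForestRegressor(random_state=0)
final_model.fit(X, y_tilde, sample_weight=weights)
theta_hat = final_model.predict(X)   # estimate of theta(X)
```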
+ + """ + + def __init__(self, *, + model_Y_X, + model_T_X, + model_T_XZ, + model_final, + featurizer=None, + cv=2, + n_splits='raise', + mc_iters=None, + mc_agg='mean', + discrete_instrument=False, + discrete_treatment=False, + categories='auto', + random_state=None): + self.model_Y_X = clone(model_Y_X, safe=False) + self.model_T_X = clone(model_T_X, safe=False) + self.model_T_XZ = clone(model_T_XZ, safe=False) + self.model_final = clone(model_final, safe=False) + self.featurizer = clone(featurizer, safe=False) + super().__init__(cv=cv, + n_splits=n_splits, + mc_iters=mc_iters, + mc_agg=mc_agg, + discrete_instrument=discrete_instrument, + discrete_treatment=discrete_treatment, + categories=categories, + random_state=random_state) + + def _gen_ortho_learner_model_nuisance(self): + return _BaseDMLIVModelNuisance(_FirstStageWrapper(clone(self.model_Y_X, safe=False), False), + _FirstStageWrapper(clone(self.model_T_X, safe=False), self.discrete_treatment), + _FirstStageWrapper(clone(self.model_T_XZ, safe=False), self.discrete_treatment)) + + def _gen_ortho_learner_model_final(self): + return _BaseDMLIVModelFinal(_FinalWrapper(clone(self.model_final, safe=False), + fit_cate_intercept=False, + featurizer=clone(self.featurizer, safe=False), + use_weight_trick=True)) diff --git a/econml/iv/dr/__init__.py b/econml/iv/dr/__init__.py new file mode 100644 index 00000000..24a8605a --- /dev/null +++ b/econml/iv/dr/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +"""Orthogonal IV for Heterogeneous Treatment Effects. + +A Double/Orthogonal machine learning approach to estimation of heterogeneous +treatment effect with an endogenous treatment and an instrument. It +implements the DMLIV and related algorithms from the paper: + +Machine Learning Estimation of Heterogeneous Treatment Effects with Instruments +Vasilis Syrgkanis, Victor Lei, Miruna Oprescu, Maggie Hei, Keith Battocchi, Greg Lewis +https://arxiv.org/abs/1905.10176 + +""" + +from ._dr import IntentToTreatDRIV, LinearIntentToTreatDRIV + +__all__ = ["IntentToTreatDRIV", + "LinearIntentToTreatDRIV"] diff --git a/econml/iv/dr/_dr.py b/econml/iv/dr/_dr.py new file mode 100644 index 00000000..aa87f34a --- /dev/null +++ b/econml/iv/dr/_dr.py @@ -0,0 +1,795 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +"""Orthogonal IV for Heterogeneous Treatment Effects. + +A Double/Orthogonal machine learning approach to estimation of heterogeneous +treatment effect with an endogenous treatment and an instrument. 
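For orientation, here is a hypothetical end-to-end sketch of the DMLIV estimator defined above. The synthetic data and the particular nuisance/final model choices are illustrative only.

```Python
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from econml.iv.dml import DMLIV

# Synthetic data: continuous treatment T, single continuous instrument Z
rng = np.random.default_rng(0)
n = 2000
X = rng.normal(size=(n, 5))
Z = rng.normal(size=n)
T = 0.8 * Z + X[:, 0] + rng.normal(size=n)
Y = (1 + X[:, 0]) * T + X[:, 1] + rng.normal(size=n)

est = DMLIV(model_Y_X=RandomForestRegressor(),
            model_T_X=RandomForestRegressor(),
            model_T_XZ=RandomForestRegressor(),
            model_final=LinearRegression(fit_intercept=False),  # intercept added by fit_cate_intercept
            cv=2)
est.fit(Y, T, Z=Z, X=X)
cate = est.effect(X[:10])   # constant marginal CATE at the first ten rows
```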
It +implements the DMLIV and related algorithms from the paper: + +Machine Learning Estimation of Heterogeneous Treatment Effects with Instruments +Vasilis Syrgkanis, Victor Lei, Miruna Oprescu, Maggie Hei, Keith Battocchi, Greg Lewis +https://arxiv.org/abs/1905.10176 + +""" + +import numpy as np +from sklearn.base import clone +from sklearn.linear_model import LinearRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import FunctionTransformer + +from ..._ortho_learner import _OrthoLearner +from ..._cate_estimator import LinearModelFinalCateEstimatorMixin, StatsModelsCateEstimatorMixin +from ...inference import StatsModelsInference +from ...sklearn_extensions.linear_model import StatsModelsLinearRegression +from ...utilities import (_deprecate_positional, add_intercept, filter_none_kwargs, + inverse_onehot) +from .._nuisance_wrappers import _FirstStageWrapper, _FinalWrapper + + +class _BaseDRIVModelFinal: + """ + Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient + that depends on X, i.e. + + .. math :: + Y - \\E[Y | X] = \\theta(X) \\cdot (\\E[T | X, Z] - \\E[T | X]) + \\epsilon + + and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final + residual on residual regression. + """ + + def __init__(self, model_final, featurizer, + discrete_treatment, discrete_instrument, + fit_cate_intercept, cov_clip, opt_reweighted): + self._model_final = clone(model_final, safe=False) + self._fit_cate_intercept = fit_cate_intercept + self._original_featurizer = clone(featurizer, safe=False) + self._discrete_treatment = discrete_treatment + self._discrete_instrument = discrete_instrument + if self._fit_cate_intercept: + add_intercept_trans = FunctionTransformer(add_intercept, + validate=True) + if featurizer: + self._featurizer = Pipeline([('featurize', self._original_featurizer), + ('add_intercept', add_intercept_trans)]) + else: + self._featurizer = add_intercept_trans + else: + self._featurizer = self._original_featurizer + self._cov_clip = cov_clip + self._opt_reweighted = opt_reweighted + + def _effect_estimate(self, nuisances): + prel_theta, res_t, res_y, res_z, cov = [nuisance.reshape(nuisances[0].shape) for nuisance in nuisances] + + # Estimate final model of theta(X) by minimizing the square loss: + # (prel_theta(X) + (Y_res - prel_theta(X) * T_res) * Z_res / cov[T,Z | X] - theta(X))^2 + # We clip the covariance so that it is bounded away from zero, so as to reduce variance + # at the expense of some small bias. For points with very small covariance we revert + # to the model-based preliminary estimate and do not add the correction term. 
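+        # For intuition (illustrative numbers, not from the source): with the default cov_clip=0.1,
+        # a point whose estimated cov[T, Z | X] is 0.02 gets clipped_cov = 0.1, so its correction term
+        # (res_y - prel_theta * res_t) * res_z / clipped_cov is shrunk five-fold and the returned
+        # estimate stays close to the preliminary prel_theta(X).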
+ cov_sign = np.sign(cov) + cov_sign[cov_sign == 0] = 1 + clipped_cov = cov_sign * np.clip(np.abs(cov), + self._cov_clip, np.inf) + return prel_theta + (res_y - prel_theta * res_t) * res_z / clipped_cov, clipped_cov + + def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + self.d_y = Y.shape[1:] + self.d_t = nuisances[1].shape[1:] + self.d_z = nuisances[3].shape[1:] + + # TODO: if opt_reweighted is False, we could change the logic to support multidimensional treatments, + # instruments, and outcomes + if self.d_y and self.d_y[0] > 2: + raise AttributeError("DRIV only supports a single outcome") + + if self.d_t and self.d_t[0] > 1: + if self._discrete_treatment: + raise AttributeError("DRIV only supports binary treatments") + else: + raise AttributeError("DRIV only supports single-dimensional continuous treatments") + + if self.d_z and self.d_z[0] > 1: + if self._discrete_instrument: + raise AttributeError("DRIV only supports binary instruments") + else: + raise AttributeError("DRIV only supports single-dimensional continuous instruments") + + theta_dr, clipped_cov = self._effect_estimate(nuisances) + + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.fit_transform(X) + if self._opt_reweighted and (sample_weight is not None): + sample_weight = sample_weight * clipped_cov.ravel()**2 + elif self._opt_reweighted: + sample_weight = clipped_cov.ravel()**2 + self._model_final.fit(X, theta_dr, **filter_none_kwargs(sample_weight=sample_weight, sample_var=sample_var)) + + return self + + def predict(self, X=None): + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.transform(X) + return self._model_final.predict(X).reshape((-1,) + self.d_y + self.d_t) + + def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): + theta_dr, clipped_cov = self._effect_estimate(nuisances) + + if (X is not None) and (self._featurizer is not None): + X = self._featurizer.transform(X) + + if self._opt_reweighted and (sample_weight is not None): + sample_weight = sample_weight * clipped_cov.ravel()**2 + elif self._opt_reweighted: + sample_weight = clipped_cov.ravel()**2 + + return np.average((theta_dr.ravel() - self._model_final.predict(X).ravel())**2, + weights=sample_weight, axis=0) + + +class _BaseDRIV(_OrthoLearner): + + """ + The _BaseDRIV algorithm for estimating CATE with IVs. It is the parent of the + two public classes {DRIV, ProjectedDRIV} + + Parameters + ---------- + nuisance_models : dictionary of nuisance models, with {'name_of_model' : EstimatorObject, ...} + + model_final : estimator + model compatible with the sklearn regression API, used to fit the effect on X + + featurizer : :term:`transformer`, optional, default None + Must support fit_transform and transform. Used to create composite features in the final CATE regression. + It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). + If featurizer=None, then CATE is trained on X. + + fit_cate_intercept : bool, optional, default True + Whether the linear CATE model should have a constant term. + + cov_clip : float, optional, default 0.1 + clipping of the covariate for regions with low "overlap", to reduce variance + + opt_reweighted : bool, optional, default False + Whether to reweight the samples to minimize variance. If True then + model_final.fit must accept sample_weight as a kw argument. 
If True then + assumes the model_final is flexible enough to fit the true CATE model. Otherwise, + it method will return a biased projection to the model_final space, biased + to give more weight on parts of the feature space where the instrument is strong. + + discrete_instrument: bool, optional, default False + Whether the instrument values should be treated as categorical, rather than continuous, quantities + + discrete_treatment: bool, optional, default False + Whether the treatment values should be treated as categorical, rather than continuous, quantities + + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + + cv: int, cross-validation generator or an iterable, optional, default 2 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter` + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the treatment is discrete + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used + (with a random shuffle in either case). + + Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all + W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + + mc_iters: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + + mc_agg: {'mean', 'median'}, optional (default='mean') + How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of + cross-fitting. + + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; + If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used + by :mod:`np.random`. + """ + + def __init__(self, *, + model_final, + featurizer=None, + fit_cate_intercept=True, + cov_clip=0.1, + opt_reweighted=False, + discrete_instrument=False, + discrete_treatment=False, + categories='auto', + cv=2, + n_splits='raise', + mc_iters=None, + mc_agg='mean', + random_state=None): + self.model_final = clone(model_final, safe=False) + self.featurizer = clone(featurizer, safe=False) + self.fit_cate_intercept = fit_cate_intercept + self.cov_clip = cov_clip + self.opt_reweighted = opt_reweighted + super().__init__(discrete_instrument=discrete_instrument, + discrete_treatment=discrete_treatment, + categories=categories, + cv=cv, + n_splits=n_splits, + mc_iters=mc_iters, + mc_agg=mc_agg, + random_state=random_state) + + def _gen_model_final(self): + return clone(self.model_final, safe=False) + + def _gen_ortho_learner_model_final(self): + return _BaseDRIVModelFinal(self._gen_model_final(), + clone(self.featurizer, safe=False), + self.discrete_treatment, + self.discrete_instrument, + self.fit_cate_intercept, + self.cov_clip, + self.opt_reweighted) + + @_deprecate_positional("X, W, and Z should be passed by keyword only. 
In a future release " + "we will disallow passing X, W, and Z by position.", ['X', 'W', 'Z']) + def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference=None): + """ + Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. + + Parameters + ---------- + Y: (n, d_y) matrix or vector of length n + Outcomes for each sample + T: (n, d_t) matrix or vector of length n + Treatments for each sample + Z: (n, d_z) matrix + Instruments for each sample + X: optional(n, d_x) matrix or None (Default=None) + Features for each sample + W: optional(n, d_w) matrix or None (Default=None) + Controls for each sample + sample_weight: optional(n,) vector or None (Default=None) + Weights for each samples + sample_var: optional(n,) vector or None (Default=None) + Sample variance for each sample + groups: (n,) vector, optional + All rows corresponding to the same group will be kept together during splitting. + If groups is not None, the `cv` argument passed to this class's initializer + must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model + inference: string,:class:`.Inference` instance, or None + Method for performing inference. This estimator supports 'bootstrap' + (or an instance of:class:`.BootstrapInference`). + + Returns + ------- + self: _BaseDRIV instance + """ + # Replacing fit from _OrthoLearner, to reorder arguments and improve the docstring + return super().fit(Y, T, X=X, W=W, Z=Z, + sample_weight=sample_weight, sample_var=sample_var, groups=groups, + cache_values=cache_values, inference=inference) + + def score(self, Y, T, Z, X=None, W=None, sample_weight=None): + """ + Score the fitted CATE model on a new data set. Generates nuisance parameters + for the new data set based on the fitted nuisance models created at fit time. + It uses the mean prediction of the models fitted by the different crossfit folds. + Then calls the score function of the model_final and returns the calculated score. + The model_final model must have a score method. + + If model_final does not have a score method, then it raises an :exc:`.AttributeError` + + Parameters + ---------- + Y: (n, d_y) matrix or vector of length n + Outcomes for each sample + T: (n, d_t) matrix or vector of length n + Treatments for each sample + Z: (n, d_z) matrix or None (Default=None) + Instruments for each sample + X: optional (n, d_x) matrix or None (Default=None) + Features for each sample + W: optional(n, d_w) matrix or None (Default=None) + Controls for each sample + sample_weight: optional(n,) vector or None (Default=None) + Weights for each samples + + Returns + ------- + score : float or (array of float) + The score of the final CATE model on the new data. Same type as the return + type of the model_final.score method. + """ + # Replacing score from _OrthoLearner, to reorder arguments and improve the docstring + return super().score(Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight) + + @property + def original_featurizer(self): + return self.ortho_learner_model_final_._original_featurizer + + @property + def featurizer_(self): + # NOTE This is used by the inference methods and has to be the overall featurizer. 
intended + # for internal use by the library + return self.ortho_learner_model_final_._featurizer + + @property + def model_final_(self): + # NOTE This is used by the inference methods and is more for internal use to the library + return self.ortho_learner_model_final_._model_final + + def cate_feature_names(self, feature_names=None): + """ + Get the output feature names. + + Parameters + ---------- + feature_names: list of strings of length X.shape[1] or None + The names of the input features. If None and X is a dataframe, it defaults to the column names + from the dataframe. + + Returns + ------- + out_feature_names: list of strings or None + The names of the output features :math:`\\phi(X)`, i.e. the features with respect to which the + final constant marginal CATE model is linear. It is the names of the features that are associated + with each entry of the :meth:`coef_` parameter. Not available when the featurizer is not None and + does not have a method: `get_feature_names(feature_names)`. Otherwise None is returned. + """ + if feature_names is None: + feature_names = self._input_names["feature_names"] + if self.original_featurizer is None: + return feature_names + elif hasattr(self.original_featurizer, 'get_feature_names'): + return self.original_featurizer.get_feature_names(feature_names) + else: + raise AttributeError("Featurizer does not have a method: get_feature_names!") + + +class _IntentToTreatDRIVModelNuisance: + """ + Nuisance model fits the three models at fit time and at predict time + returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals. + """ + + def __init__(self, model_Y_X, model_T_XZ, prel_model_effect): + self._model_Y_X = clone(model_Y_X, safe=False) + self._model_T_XZ = clone(model_T_XZ, safe=False) + self._prel_model_effect = clone(prel_model_effect, safe=False) + + def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + self._model_Y_X.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) + self._model_T_XZ.fit(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight, groups=groups) + # we need to undo the one-hot encoding for calling effect, + # since it expects raw values + self._prel_model_effect.fit(Y, inverse_onehot(T), Z=inverse_onehot(Z), X=X, W=W, + sample_weight=sample_weight, groups=groups) + return self + + def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + if hasattr(self._model_Y_X, 'score'): + Y_X_score = self._model_Y_X.score(X=X, W=W, Target=Y, sample_weight=sample_weight) + else: + Y_X_score = None + if hasattr(self._model_T_XZ, 'score'): + T_XZ_score = self._model_T_XZ.score(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight) + else: + T_XZ_score = None + if hasattr(self._prel_model_effect, 'score'): + # we need to undo the one-hot encoding for calling effect, + # since it expects raw values + effect_score = self._prel_model_effect.score(Y, inverse_onehot(T), + Z=inverse_onehot(Z), X=X, W=W, sample_weight=sample_weight) + else: + effect_score = None + + return Y_X_score, T_XZ_score, effect_score + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): + Y_pred = self._model_Y_X.predict(X, W) + T_pred_zero = self._model_T_XZ.predict(X, W, np.zeros(Z.shape)) + T_pred_one = self._model_T_XZ.predict(X, W, np.ones(Z.shape)) + delta = (T_pred_one - T_pred_zero) / 2 + T_pred_mean = (T_pred_one + T_pred_zero) / 2 + prel_theta = self._prel_model_effect.effect(X) + if X is None: # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, 
-1), (Y.shape[0], 1)) + prel_theta = np.tile(prel_theta.reshape(1, -1), (T.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = T - T_pred_mean.reshape(T.shape) + return prel_theta, T_res, Y_res, 2 * Z - 1, delta + + +class _IntentToTreatDRIV(_BaseDRIV): + """ + Helper class for the DRIV algorithm for the intent-to-treat A/B test setting + """ + + def __init__(self, *, + model_Y_X, + model_T_XZ, + prel_model_effect, + model_final, + featurizer=None, + fit_cate_intercept=True, + cov_clip=.1, + cv=3, + n_splits='raise', + mc_iters=None, + mc_agg='mean', + opt_reweighted=False, + categories='auto', + random_state=None): + """ + """ + self.model_Y_X = clone(model_Y_X, safe=False) + self.model_T_XZ = clone(model_T_XZ, safe=False) + self.prel_model_effect = clone(prel_model_effect, safe=False) + # TODO: check that Y, T, Z do not have multiple columns + super().__init__(model_final=model_final, + featurizer=featurizer, + fit_cate_intercept=fit_cate_intercept, + cov_clip=cov_clip, + cv=cv, + n_splits=n_splits, + mc_iters=mc_iters, + mc_agg=mc_agg, + discrete_instrument=True, + discrete_treatment=True, + categories=categories, + opt_reweighted=opt_reweighted, + random_state=random_state) + + def _gen_prel_model_effect(self): + return clone(self.prel_model_effect, safe=False) + + def _gen_ortho_learner_model_nuisance(self): + return _IntentToTreatDRIVModelNuisance(_FirstStageWrapper(clone(self.model_Y_X, safe=False), + discrete_target=False), + _FirstStageWrapper(clone(self.model_T_XZ, safe=False), + discrete_target=True), + self._gen_prel_model_effect()) + + +class _DummyCATE: + """ + A dummy cate effect model that always returns zero effect + """ + + def __init__(self): + return + + def fit(self, y, T, *, Z, X, W=None, sample_weight=None, groups=None, **kwargs): + return self + + def effect(self, X): + if X is None: + return np.zeros(1) + return np.zeros(X.shape[0]) + + +class IntentToTreatDRIV(_IntentToTreatDRIV): + """ + Implements the DRIV algorithm for the intent-to-treat A/B test setting + + Parameters + ---------- + model_Y_X : estimator + model to estimate :math:`\\E[Y | X]`. Must support `fit` and `predict` methods. + + model_T_XZ : estimator + model to estimate :math:`\\E[T | X, Z]`. Must support `fit` and `predict_proba` methods. + + flexible_model_effect : estimator + a flexible model for a preliminary version of the CATE, must accept sample_weight at fit time. + + final_model_effect : estimator, optional + a final model for the CATE and projections. If None, then flexible_model_effect is also used as a final model + + featurizer : :term:`transformer`, optional, default None + Must support fit_transform and transform. Used to create composite features in the final CATE regression. + It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). + If featurizer=None, then CATE is trained on X. + + fit_cate_intercept : bool, optional, default True + Whether the linear CATE model should have a constant term. + + cov_clip : float, optional, default 0.1 + clipping of the covariate for regions with low "overlap", to reduce variance + + cv: int, cross-validation generator or an iterable, optional, default 3 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter` + - An iterable yielding (train, test) splits as arrays of indices. 
+ + For integer/None inputs, if the treatment is discrete + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used + (with a random shuffle in either case). + + Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all + W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + + mc_iters: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + + mc_agg: {'mean', 'median'}, optional (default='mean') + How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of + cross-fitting. + + opt_reweighted : bool, optional, default False + Whether to reweight the samples to minimize variance. If True then + final_model_effect.fit must accept sample_weight as a kw argument (WeightWrapper from + utilities can be used for any linear model to enable sample_weights). If True then + assumes the final_model_effect is flexible enough to fit the true CATE model. Otherwise, + it method will return a biased projection to the model_effect space, biased + to give more weight on parts of the feature space where the instrument is strong. + + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; + If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used + by :mod:`np.random`. 
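A hypothetical usage sketch for IntentToTreatDRIV with a nonparametric final stage; the simulated A/B-test data and the gradient-boosting model choices below are illustrative, not prescriptive.

```Python
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from econml.iv.dr import IntentToTreatDRIV

# A/B-test style data: randomized binary instrument Z (assignment), binary treatment T (uptake)
rng = np.random.default_rng(1)
n = 5000
X = rng.normal(size=(n, 4))
Z = rng.binomial(1, 0.5, size=n)
T = rng.binomial(1, 0.2 + 0.6 * Z)          # imperfect compliance
Y = (0.5 + X[:, 0]) * T + X[:, 1] + rng.normal(size=n)

est = IntentToTreatDRIV(model_Y_X=GradientBoostingRegressor(),
                        model_T_XZ=GradientBoostingClassifier(),
                        flexible_model_effect=GradientBoostingRegressor())
est.fit(Y, T, Z=Z, X=X)
cate = est.effect(X[:5])   # CATE of the treatment for the first five rows
```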
+ """ + + def __init__(self, *, + model_Y_X, + model_T_XZ, + flexible_model_effect, + model_final=None, + featurizer=None, + fit_cate_intercept=True, + cov_clip=.1, + cv=3, + n_splits='raise', + mc_iters=None, + mc_agg='mean', + opt_reweighted=False, + categories='auto', + random_state=None): + self.flexible_model_effect = clone(flexible_model_effect, safe=False) + super().__init__(model_Y_X=model_Y_X, + model_T_XZ=model_T_XZ, + prel_model_effect=None, + model_final=model_final, + featurizer=featurizer, + fit_cate_intercept=fit_cate_intercept, + cov_clip=cov_clip, + cv=cv, + n_splits=n_splits, + mc_iters=mc_iters, + mc_agg=mc_agg, + opt_reweighted=opt_reweighted, + categories=categories, + random_state=random_state) + + def _gen_model_final(self): + if self.model_final is None: + return clone(self.flexible_model_effect, safe=False) + return clone(self.model_final, safe=False) + + def _gen_prel_model_effect(self): + return _IntentToTreatDRIV(model_Y_X=clone(self.model_Y_X, safe=False), + model_T_XZ=clone(self.model_T_XZ, safe=False), + prel_model_effect=_DummyCATE(), + model_final=clone(self.flexible_model_effect, safe=False), + cov_clip=1e-7, + cv=1, + opt_reweighted=True, + random_state=self.random_state) + + @property + def models_Y_X(self): + return [mdl._model_Y_X._model for mdl in super().models_nuisance_] + + @property + def models_T_XZ(self): + return [mdl._model_T_XZ._model for mdl in super().models_nuisance_] + + @property + def nuisance_scores_Y_X(self): + return self.nuisance_scores_[0] + + @property + def nuisance_scores_T_XZ(self): + return self.nuisance_scores_[1] + + @property + def nuisance_scores_effect(self): + return self.nuisance_scores_[2] + + @property + def prel_model_effect(self): + return self._gen_prel_model_effect() + + @prel_model_effect.setter + def prel_model_effect(self, value): + if value is not None: + raise ValueError("Parameter `prel_model_effect` cannot be altered for this estimator.") + + +class LinearIntentToTreatDRIV(StatsModelsCateEstimatorMixin, IntentToTreatDRIV): + """ + Implements the DRIV algorithm for the intent-to-treat A/B test setting + + Parameters + ---------- + model_Y_X : estimator + model to estimate :math:`\\E[Y | X]`. Must support `fit` and `predict` methods. + + model_T_XZ : estimator + model to estimate :math:`\\E[T | X, Z]`. Must support `fit` and `predict_proba` methods. + + flexible_model_effect : estimator + a flexible model for a preliminary version of the CATE, must accept sample_weight at fit time. + + featurizer : :term:`transformer`, optional, default None + Must support fit_transform and transform. Used to create composite features in the final CATE regression. + It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). + If featurizer=None, then CATE is trained on X. + + fit_cate_intercept : bool, optional, default True + Whether the linear CATE model should have a constant term. + + cov_clip : float, optional, default 0.1 + clipping of the covariate for regions with low "overlap", to reduce variance + + cv: int, cross-validation generator or an iterable, optional, default 3 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter` + - An iterable yielding (train, test) splits as arrays of indices. 
+ + For integer/None inputs, if the treatment is discrete + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used + (with a random shuffle in either case). + + Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all + W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + + mc_iters: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + + mc_agg: {'mean', 'median'}, optional (default='mean') + How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of + cross-fitting. + + categories: 'auto' or list, default 'auto' + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; + If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used + by :mod:`np.random`. + """ + + def __init__(self, *, + model_Y_X, + model_T_XZ, + flexible_model_effect, + featurizer=None, + fit_cate_intercept=True, + cov_clip=.1, + cv=3, + n_splits='raise', + mc_iters=None, + mc_agg='mean', + categories='auto', + random_state=None): + super().__init__(model_Y_X=model_Y_X, + model_T_XZ=model_T_XZ, + flexible_model_effect=flexible_model_effect, + featurizer=featurizer, + fit_cate_intercept=fit_cate_intercept, + model_final=None, + cov_clip=cov_clip, + cv=cv, + n_splits=n_splits, + mc_iters=mc_iters, + mc_agg=mc_agg, + opt_reweighted=False, + categories=categories, random_state=random_state) + + def _gen_model_final(self): + return StatsModelsLinearRegression(fit_intercept=False) + + # override only so that we can update the docstring to indicate support for `StatsModelsInference` + @_deprecate_positional("X, W, and Z should be passed by keyword only. In a future release " + "we will disallow passing X, W, and Z by position.", ['X', 'W', 'Z']) + def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference='auto'): + """ + Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. + + Parameters + ---------- + Y: (n, d_y) matrix or vector of length n + Outcomes for each sample + T: (n, d_t) matrix or vector of length n + Treatments for each sample + Z: (n, d_z) matrix or vector of length n + Instruments for each sample + X: optional(n, d_x) matrix or None (Default=None) + Features for each sample + W: optional(n, d_w) matrix or None (Default=None) + Controls for each sample + sample_weight: optional(n,) vector or None (Default=None) + Weights for each samples + sample_var: optional(n,) vector or None (Default=None) + Sample variance for each sample + groups: (n,) vector, optional + All rows corresponding to the same group will be kept together during splitting. + If groups is not None, the `cv` argument passed to this class's initializer + must support a 'groups' argument to its split method. 
+ cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model + inference: string,:class:`.Inference` instance, or None + Method for performing inference. This estimator supports 'bootstrap' + (or an instance of:class:`.BootstrapInference`) and 'statsmodels' + (or an instance of :class:`.StatsModelsInference`). + + Returns + ------- + self : instance + """ + return super().fit(Y, T, Z=Z, X=X, W=W, + sample_weight=sample_weight, sample_var=sample_var, groups=groups, + cache_values=cache_values, inference=inference) + + def refit_final(self, *, inference='auto'): + return super().refit_final(inference=inference) + refit_final.__doc__ = _OrthoLearner.refit_final.__doc__ + + @property + def bias_part_of_coef(self): + return self.ortho_learner_model_final_._fit_cate_intercept + + @property + def fit_cate_intercept_(self): + return self.ortho_learner_model_final_._fit_cate_intercept + + @property + def model_final(self): + return self._gen_model_final() + + @model_final.setter + def model_final(self, value): + if value is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator.") + + @property + def opt_reweighted(self): + return False + + @opt_reweighted.setter + def opt_reweighted(self, value): + if not (value is False): + raise ValueError("Parameter `value` cannot be altered from `False` for this estimator.") diff --git a/econml/iv/nnet/__init__.py b/econml/iv/nnet/__init__.py new file mode 100644 index 00000000..dd4f68a7 --- /dev/null +++ b/econml/iv/nnet/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +from ._deepiv import DeepIV + +__all__ = ["DeepIV"] diff --git a/econml/iv/nnet/_deepiv.py b/econml/iv/nnet/_deepiv.py new file mode 100644 index 00000000..6f5a8369 --- /dev/null +++ b/econml/iv/nnet/_deepiv.py @@ -0,0 +1,455 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +"""Deep IV estimator and related components.""" + +import numpy as np +import keras +from ..._cate_estimator import BaseCateEstimator +from ...utilities import deprecated +from keras import backend as K +import keras.layers as L +from keras.models import Model +from econml.utilities import check_input_arrays, _deprecate_positional + +# TODO: make sure to use random seeds wherever necessary +# TODO: make sure that the public API consistently uses "T" instead of "P" for the treatment + +# unfortunately with the Theano and Tensorflow backends, +# the straightforward use of K.stop_gradient can cause an error +# because the parameters of the intermediate layers are now disconnected from the loss; +# therefore we add a pointless multiplication by 0 to the values in each of the variables in vs +# so that those layers remain connected but with 0 gradient + + +def _zero_grad(e, vs): + if K.backend() == 'cntk': + return K.stop_gradient(e) + else: + z = 0 * K.sum(K.concatenate([K.batch_flatten(v) for v in vs])) + return K.stop_gradient(e) + z + + +def mog_model(n_components, d_x, d_t): + """ + Create a mixture of Gaussians model with the specified number of components. 
+ + Parameters + ---------- + n_components : int + The number of components in the mixture model + + d_x : int + The number of dimensions in the layer used as input + + d_t : int + The number of dimensions in the output + + Returns + ------- + A Keras model that takes an input of dimension `d_t` and generates three outputs: pi, mu, and sigma + + """ + x = L.Input((d_x,)) + pi = L.Dense(n_components, activation='softmax')(x) + mu = L.Reshape((n_components, d_t))(L.Dense(n_components * d_t)(x)) + log_sig = L.Dense(n_components)(x) + sig = L.Lambda(K.exp)(log_sig) + return Model([x], [pi, mu, sig]) + + +def mog_loss_model(n_components, d_t): + """ + Create a Keras model that computes the loss of a mixture of Gaussians model on data. + + Parameters + ---------- + n_components : int + The number of components in the mixture model + + d_t : int + The number of dimensions in the output + + Returns + ------- + A Keras model that takes as inputs pi, mu, sigma, and t and generates a single output containing the loss. + + """ + pi = L.Input((n_components,)) + mu = L.Input((n_components, d_t)) + sig = L.Input((n_components,)) + t = L.Input((d_t,)) + + # || t - mu_i || ^2 + d2 = L.Lambda(lambda d: K.sum(K.square(d), axis=-1), + output_shape=(n_components,))( + L.Subtract()([L.RepeatVector(n_components)(t), mu]) + ) + + # LL = C - log(sum(pi_i/sig^d * exp(-d2/(2*sig^2)))) + # Use logsumexp for numeric stability: + # LL = C - log(sum(exp(-d2/(2*sig^2) + log(pi_i/sig^d)))) + # TODO: does the numeric stability actually make any difference? + def make_logloss(d2, sig, pi): + return -K.logsumexp(-d2 / (2 * K.square(sig)) + K.log(pi / K.pow(sig, d_t)), axis=-1) + + ll = L.Lambda(lambda dsp: make_logloss(*dsp), output_shape=(1,))([d2, sig, pi]) + + m = Model([pi, mu, sig, t], [ll]) + return m + + +def mog_sample_model(n_components, d_t): + """ + Create a model that generates samples from a mixture of Gaussians. + + Parameters + ---------- + n_components : int + The number of components in the mixture model + + d_t : int + The number of dimensions in the output + + Returns + ------- + A Keras model that takes as inputs pi, mu, and sigma, and generates a single output containing a sample. 
+
+    """
+    pi = L.Input((n_components,))
+    mu = L.Input((n_components, d_t))
+    sig = L.Input((n_components,))
+
+    # CNTK backend can't randomize across batches and doesn't implement cumsum (at least as of June 2018,
+    # see Known Issues on https://docs.microsoft.com/en-us/cognitive-toolkit/Using-CNTK-with-Keras)
+    def sample(pi, mu, sig):
+        batch_size = K.shape(pi)[0]
+        if K.backend() == 'cntk':
+            # generate cumulative sum via matrix multiplication
+            cumsum = K.dot(pi, K.constant(np.triu(np.ones((n_components, n_components)))))
+        else:
+            cumsum = K.cumsum(pi, 1)
+        cumsum_shift = K.concatenate([K.zeros_like(cumsum[:, 0:1]), cumsum])[:, :-1]
+        if K.backend() == 'cntk':
+            import cntk as C
+            # Generate standard uniform values in shape (batch_size,1)
+            # (since we can't use the dynamic batch_size with random.uniform in CNTK,
+            # we use uniform_like instead with an input of an appropriate shape)
+            rndSmp = C.random.uniform_like(pi[:, 0:1])
+        else:
+            rndSmp = K.random_uniform((batch_size, 1))
+        cmp1 = K.less_equal(cumsum_shift, rndSmp)
+        cmp2 = K.less(rndSmp, cumsum)
+
+        # convert to floats and multiply to perform equivalent of logical AND
+        rndIndex = K.cast(cmp1, K.floatx()) * K.cast(cmp2, K.floatx())
+
+        if K.backend() == 'cntk':
+            # Generate standard normal values in shape (batch_size,1,d_t)
+            # (since we can't use the dynamic batch_size with random.normal in CNTK,
+            # we use normal_like instead with an input of an appropriate shape)
+            rndNorms = C.random.normal_like(mu[:, 0:1, :])  # K.random_normal((1,d_t))
+        else:
+            rndNorms = K.random_normal((batch_size, 1, d_t))
+
+        rndVec = mu + K.expand_dims(sig) * rndNorms
+
+        # exactly one entry should be nonzero for each b,d combination; use sum to select it
+        return K.sum(K.expand_dims(rndIndex) * rndVec, 1)
+
+    # prevent gradient from passing through sampling
+    samp = L.Lambda(lambda pms: _zero_grad(sample(*pms), pms), output_shape=(d_t,))
+    samp.trainable = False
+
+    return Model([pi, mu, sig], samp([pi, mu, sig]))
+
+
+# three options: biased or upper-bound loss require a single number of samples;
+# unbiased can take different numbers for the network and its gradient
+def response_loss_model(h, p, d_z, d_x, d_y, samples=1, use_upper_bound=False, gradient_samples=0):
+    """
+    Create a Keras model that computes the loss of a response model on data.
+
+    Parameters
+    ----------
+    h : (tensor, tensor) -> Layer
+        Method for building a model of y given p and x
+
+    p : (tensor, tensor) -> Layer
+        Method for building a model of p given z and x
+
+    d_z : int
+        The number of dimensions in z
+
+    d_x : int
+        The number of dimensions in x
+
+    d_y : int
+        The number of dimensions in y
+
+    samples: int
+        The number of samples to use
+
+    use_upper_bound : bool
+        Whether to use an upper bound to the true loss
+        (equivalent to adding a regularization penalty on the variance of h)
+
+    gradient_samples : int
+        The number of separate additional samples to use when calculating the gradient.
+        This can only be nonzero if use_upper_bound is False, in which case the gradient of
+        the returned loss will be an unbiased estimate of the gradient of the true loss.
+
+    Returns
+    -------
+    A Keras model that takes as inputs z, x, and y and generates a single output containing the loss.
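As a plain-numpy illustration of the sampling scheme implemented by mog_sample_model above, for a single observation (not library code, and boundary handling is simplified):

```Python
import numpy as np

rng = np.random.default_rng(0)
pi = np.array([0.2, 0.5, 0.3])          # mixture weights
mu = np.array([[-1.0], [0.0], [2.0]])   # component means, shape (n_components, d_t)
sig = np.array([0.5, 1.0, 0.3])         # component scales

# Pick a component with probability pi (the cumulative-sum trick), then draw from its Gaussian
k = np.searchsorted(np.cumsum(pi), rng.uniform())
sample = mu[k] + sig[k] * rng.normal(size=mu.shape[1])
```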
+ + """ + assert not(use_upper_bound and gradient_samples) + + # sample: (() -> Layer, int) -> Layer + def sample(f, n): + assert n > 0 + if n == 1: + return f() + else: + return L.average([f() for _ in range(n)]) + z, x, y = [L.Input((d,)) for d in [d_z, d_x, d_y]] + if gradient_samples: + # we want to separately sample the gradient; we use stop_gradient to treat the sampled model as constant + # the overall computation ensures that we have an interpretable loss (y-h̅(p,x))², + # but also that the gradient is -2(y-h̅(p,x))∇h̅(p,x) with *different* samples used for each average + diff = L.subtract([y, sample(lambda: h(p(z, x), x), samples)]) + grad = sample(lambda: h(p(z, x), x), gradient_samples) + + def make_expr(grad, diff): + return K.stop_gradient(diff) * (K.stop_gradient(diff + 2 * grad) - 2 * grad) + expr = L.Lambda(lambda args: make_expr(*args))([grad, diff]) + elif use_upper_bound: + expr = sample(lambda: L.Lambda(K.square)(L.subtract([y, h(p(z, x), x)])), samples) + else: + expr = L.Lambda(K.square)(L.subtract([y, sample(lambda: h(p(z, x), x), samples)])) + return Model([z, x, y], [expr]) + + +class DeepIV(BaseCateEstimator): + """ + The Deep IV Estimator (see http://proceedings.mlr.press/v70/hartford17a/hartford17a.pdf). + + Parameters + ---------- + n_components : int + Number of components in the mixture density network + + m : (tensor, tensor) -> Layer + Method for building a Keras model that featurizes the z and x inputs + + h : (tensor, tensor) -> Layer + Method for building a model of y given t and x + + n_samples : int + The number of samples to use + + use_upper_bound_loss : bool, optional + Whether to use an upper bound to the true loss + (equivalent to adding a regularization penalty on the variance of h). + Defaults to False. + + n_gradient_samples : int, optional + The number of separate additional samples to use when calculating the gradient. + This can only be nonzero if user_upper_bound is False, in which case the gradient of + the returned loss will be an unbiased estimate of the gradient of the true loss. + Defaults to 0. + + optimizer : string, optional + The optimizer to use. Defaults to "adam" + + first_stage_options : dictionary, optional + The keyword arguments to pass to Keras's `fit` method when training the first stage model. + Defaults to `{"epochs": 100}`. + + second_stage_options : dictionary, optional + The keyword arguments to pass to Keras's `fit` method when training the second stage model. + Defaults to `{"epochs": 100}`. + + """ + + def __init__(self, *, + n_components, + m, + h, + n_samples, use_upper_bound_loss=False, n_gradient_samples=0, + optimizer='adam', + first_stage_options={"epochs": 100}, + second_stage_options={"epochs": 100}): + self._n_components = n_components + self._m = m + self._h = h + self._n_samples = n_samples + self._use_upper_bound_loss = use_upper_bound_loss + self._n_gradient_samples = n_gradient_samples + self._optimizer = optimizer + self._first_stage_options = first_stage_options + self._second_stage_options = second_stage_options + super().__init__() + + @_deprecate_positional("X and Z should be passed by keyword only. In a future release " + "we will disallow passing X and Z by position.", ['X', 'Z']) + @BaseCateEstimator._wrap_fit + def fit(self, Y, T, X, Z, *, inference=None): + """Estimate the counterfactual model from data. + + That is, estimate functions τ(·, ·, ·), ∂τ(·, ·). 
+ + Parameters + ---------- + Y: (n × d_y) matrix or vector of length n + Outcomes for each sample + T: (n × dₜ) matrix or vector of length n + Treatments for each sample + X: (n × dₓ) matrix + Features for each sample + Z: (n × d_z) matrix + Instruments for each sample + inference: string, :class:`.Inference` instance, or None + Method for performing inference. This estimator supports 'bootstrap' + (or an instance of :class:`.BootstrapInference`) + + Returns + ------- + self + + """ + Y, T, X, Z = check_input_arrays(Y, T, X, Z) + assert 1 <= np.ndim(X) <= 2 + assert 1 <= np.ndim(Z) <= 2 + assert 1 <= np.ndim(T) <= 2 + assert 1 <= np.ndim(Y) <= 2 + assert np.shape(X)[0] == np.shape(Y)[0] == np.shape(T)[0] == np.shape(Z)[0] + + # in case vectors were passed for Y or T, keep track of trailing dims for reshaping effect output + + d_x, d_y, d_z, d_t = [np.shape(a)[1] if np.ndim(a) > 1 else 1 for a in [X, Y, Z, T]] + x_in, y_in, z_in, t_in = [L.Input((d,)) for d in [d_x, d_y, d_z, d_t]] + n_components = self._n_components + + treatment_network = self._m(z_in, x_in) + + # the dimensionality of the output of the network + # TODO: is there a more robust way to do this? + d_n = K.int_shape(treatment_network)[-1] + + pi, mu, sig = mog_model(n_components, d_n, d_t)([treatment_network]) + + ll = mog_loss_model(n_components, d_t)([pi, mu, sig, t_in]) + + model = Model([z_in, x_in, t_in], [ll]) + model.add_loss(L.Lambda(K.mean)(ll)) + model.compile(self._optimizer) + # TODO: do we need to give the user more control over other arguments to fit? + model.fit([Z, X, T], [], **self._first_stage_options) + + lm = response_loss_model(lambda t, x: self._h(t, x), + lambda z, x: Model([z_in, x_in], + # subtle point: we need to build a new model each time, + # because each model encapsulates its randomness + [mog_sample_model(n_components, d_t)([pi, mu, sig])])([z, x]), + d_z, d_x, d_y, + self._n_samples, self._use_upper_bound_loss, self._n_gradient_samples) + + rl = lm([z_in, x_in, y_in]) + response_model = Model([z_in, x_in, y_in], [rl]) + response_model.add_loss(L.Lambda(K.mean)(rl)) + response_model.compile(self._optimizer) + # TODO: do we need to give the user more control over other arguments to fit? + response_model.fit([Z, X, Y], [], **self._second_stage_options) + + self._effect_model = Model([t_in, x_in], [self._h(t_in, x_in)]) + + # TODO: it seems like we need to sum over the batch because we can only apply gradient to a scalar, + # not a general tensor (because of how backprop works in every framework) + # (alternatively, we could iterate through the batch in addition to iterating through the output, + # but this seems annoying...) + # Therefore, it's important that we use a batch size of 1 when we call predict with this model + def calc_grad(t, x): + h = self._h(t, x) + all_grads = K.concatenate([g + for i in range(d_y) + for g in K.gradients(K.sum(h[:, i]), [t])]) + return K.reshape(all_grads, (-1, d_y, d_t)) + + self._marginal_effect_model = Model([t_in, x_in], L.Lambda(lambda tx: calc_grad(*tx))([t_in, x_in])) + + def effect(self, X=None, T0=0, T1=1): + """ + Calculate the heterogeneous treatment effect τ(·,·,·). + + The effect is calculated between the two treatment points + conditional on a vector of features on a set of m test samples {T0ᵢ, T1ᵢ, Xᵢ}. 
+ + Parameters + ---------- + T0: (m × dₜ) matrix + Base treatments for each sample + T1: (m × dₜ) matrix + Target treatments for each sample + X: optional (m × dₓ) matrix + Features for each sample + + Returns + ------- + τ: (m × d_y) matrix + Heterogeneous treatment effects on each outcome for each sample + Note that when Y is a vector rather than a 2-dimensional array, the corresponding + singleton dimension will be collapsed (so this method will return a vector) + """ + X, T0, T1 = check_input_arrays(X, T0, T1) + if np.ndim(T0) == 0: + T0 = np.repeat(T0, 1 if X is None else np.shape(X)[0]) + if np.ndim(T1) == 0: + T1 = np.repeat(T1, 1 if X is None else np.shape(X)[0]) + if X is None: + X = np.empty((np.shape(T0)[0], 0)) + return (self._effect_model.predict([T1, X]) - self._effect_model.predict([T0, X])).reshape((-1,) + self._d_y) + + def marginal_effect(self, T, X=None): + """ + Calculate the marginal effect ∂τ(·, ·) around a base treatment point conditional on features. + + Parameters + ---------- + T: (m × dₜ) matrix + Base treatments for each sample + X: optional(m × dₓ) matrix + Features for each sample + + Returns + ------- + grad_tau: (m × d_y × dₜ) array + Heterogeneous marginal effects on each outcome for each sample + Note that when Y or T is a vector rather than a 2-dimensional array, + the corresponding singleton dimensions in the output will be collapsed + (e.g. if both are vectors, then the output of this method will also be a vector) + """ + T, X = check_input_arrays(T, X) + # TODO: any way to get this to work on batches of arbitrary size? + return self._marginal_effect_model.predict([T, X], batch_size=1).reshape((-1,) + self._d_y + self._d_t) + + def predict(self, T, X): + """Predict outcomes given treatment assignments and features. + + Parameters + ---------- + T: (m × dₜ) matrix + Base treatments for each sample + X: (m × dₓ) matrix + Features for each sample + + Returns + ------- + Y: (m × d_y) matrix + Outcomes for each sample + Note that when Y is a vector rather than a 2-dimensional array, the corresponding + singleton dimension will be collapsed (so this method will return a vector) + """ + T, X = check_input_arrays(T, X) + return self._effect_model.predict([T, X]).reshape((-1,) + self._d_y) diff --git a/econml/iv/sieve/__init__.py b/econml/iv/sieve/__init__.py new file mode 100644 index 00000000..4cc9d29c --- /dev/null +++ b/econml/iv/sieve/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +from ._tsls import HermiteFeatures, DPolynomialFeatures, SieveTSLS + +__all__ = ["HermiteFeatures", + "DPolynomialFeatures", + "SieveTSLS"] diff --git a/econml/iv/sieve/_tsls.py b/econml/iv/sieve/_tsls.py new file mode 100644 index 00000000..7f266967 --- /dev/null +++ b/econml/iv/sieve/_tsls.py @@ -0,0 +1,362 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
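Before turning to the sieve-based estimator below, a hypothetical illustration of the three prediction surfaces a fitted DeepIV instance exposes; `est` is assumed to have been fit already, and the array shapes must match its training data.

```Python
import numpy as np

X_test = np.random.normal(size=(10, 2))   # d_x must match the data est was fit on

tau = est.effect(X_test, T0=np.zeros((10, 1)), T1=np.ones((10, 1)))   # h(1, x) - h(0, x)
dtau = est.marginal_effect(np.zeros((10, 1)), X_test)                 # d h / d T at T = 0
y_hat = est.predict(np.ones((10, 1)), X_test)                         # fitted response h(1, x)
```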
+ +"""Provides a non-parametric two-stage least squares instrumental variable estimator.""" + +import numpy as np +from copy import deepcopy +from sklearn import clone +from sklearn.linear_model import LinearRegression +from ...utilities import (shape, transpose, reshape, cross_product, ndim, size, + _deprecate_positional, check_input_arrays) +from ..._cate_estimator import BaseCateEstimator, LinearCateEstimator +from numpy.polynomial.hermite_e import hermeval +from sklearn.base import TransformerMixin +from sklearn.preprocessing import PolynomialFeatures +from itertools import product + + +class HermiteFeatures(TransformerMixin): + """ + Featurizer that returns(unscaled) Hermite function evaluations. + + The evaluated functions are of degrees 0..`degree`, differentiated `shift` times. + + If the input has shape(n, x) and `joint` is False, the output will have shape(n, (`degree`+ 1)×x) if `shift` is 0. + If the input has shape(n, x) and `joint` is True, the output will have shape(n, (`degree`+ 1) ^ x) if `shift` is 0. + In either case, if `shift` is nonzero there will be `shift` additional dimensions of size x + between the first and last. + """ + + def __init__(self, degree, shift=0, joint=False): + self._degree = degree + self._shift = shift + self._joint = joint + + def _column_feats(self, X, shift): + """ + Apply Hermite function evaluations of degrees 0..`degree` differentiated `shift` times. + + When applied to the column `X` of shape(n,), the resulting array has shape(n, (degree + 1)). + """ + assert ndim(X) == 1 + # this will have dimension (d,) + shape(X) + coeffs = np.identity(self._degree + shift + 1)[:, shift:] + feats = ((-1) ** shift) * hermeval(X, coeffs) * np.exp(-X * X / 2) + # send the first dimension to the end + return transpose(feats) + + def fit(self, X): + """Fits the data(a NOP for this class) and returns self.""" + return self + + def transform(self, X): + """ + Transform the data by applying the appropriate Hermite functions. + + Parameters + ---------- + X: array_like + 2-dimensional array of input features + + Returns + ------- + The transformed data + """ + assert ndim(X) == 2 + n = shape(X)[0] + ncols = shape(X)[1] + columns = [] + for indices in product(*[range(ncols) for i in range(self._shift)]): + if self._joint: + columns.append(cross_product(*[self._column_feats(X[:, i], indices.count(i)) + for i in range(shape(X)[1])])) + else: + indices = set(indices) + if self._shift == 0: # return features for all columns: + columns.append(np.hstack([self._column_feats(X[:, i], self._shift) for i in range(shape(X)[1])])) + # columns are featurized independently; partial derivatives are only non-zero + # when taken with respect to the same column each time + elif len(indices) == 1: + index = list(indices)[0] + feats = self._column_feats(X[:, index], self._shift) + columns.append(np.hstack([feats if i == index else np.zeros(shape(feats)) + for i in range(shape(X)[1])])) + else: + columns.append(np.zeros((n, (self._degree + 1) * ncols))) + return reshape(np.hstack(columns), (n,) + (ncols,) * self._shift + (-1,)) + + +class DPolynomialFeatures(TransformerMixin): + """ + Featurizer that returns the derivatives of :class:`~sklearn.preprocessing.PolynomialFeatures` features in + a way that's compativle with the expectations of :class:`.NonparametricTwoStageLeastSquares`'s + `dt_featurizer` parameter. 
+ + If the input has shape `(n, x)` and + :meth:`PolynomialFeatures.transform` returns an output + of shape `(n, f)`, then :meth:`.transform` will return an array of shape `(n, x, f)`. + + Parameters + ---------- + degree: integer, default = 2 + The degree of the polynomial features. + + interaction_only: boolean, default = False + If true, only derivatives of interaction features are produced: features that are products of at most degree + distinct input features (so not `x[1] ** 2`, `x[0] * x[2] ** 3`, etc.). + + include_bias: boolean, default = True + If True (default), then include the derivative of a bias column, the feature in which all polynomial powers + are zero. + """ + + def __init__(self, degree=2, interaction_only=False, include_bias=True): + self.F = PolynomialFeatures(degree=degree, interaction_only=interaction_only, include_bias=include_bias) + + def fit(self, X, y=None): + """ + Compute number of output features. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The data. + y : array, optional + Not used + + Returns + ------- + self : instance + """ + return self + + def transform(self, X): + """ + Transform data to derivatives of polynomial features + + Parameters + ---------- + X: array-like, shape (n_samples, n_features) + The data to transform, row by row. + + Returns + ------- + XP: array-like, shape (n_samples, n_features, n_output_features) + The matrix of features, where `n_output_features` is the number of features that + would be returned from :class:`~sklearn.preprocessing.PolynomialFeatures`. + """ + self.F.fit(X) + powers = self.F.powers_ + result = np.zeros(X.shape + (self.F.n_output_features_,)) + for i in range(X.shape[1]): + p = powers.copy() + c = powers[:, i] + p[:, i] -= 1 + M = np.float_power(X[:, np.newaxis, :], p[np.newaxis, :, :]) + result[:, i, :] = c[np.newaxis, :] * np.prod(M, axis=-1) + return result + + +def _add_ones(arr): + """Add a column of ones to the front of an array.""" + return np.hstack([np.ones((shape(arr)[0], 1)), arr]) + + +def _add_zeros(arr): + """Add a column of zeros to the front of an array.""" + return np.hstack([np.zeros((shape(arr)[0], 1)), arr]) + + +class SieveTSLS(BaseCateEstimator): + """ + Non-parametric instrumental variables estimator. + + Supports the use of arbitrary featurizers for the features, treatments, and instruments. + + Parameters + ---------- + t_featurizer: transformer + Featurizer used to transform the treatments + + x_featurizer: transformer + Featurizer used to transform the raw features + + z_featurizer: transformer + Featurizer used to transform the instruments + + dt_featurizer: transformer + Featurizer used to transform the treatments for the computation of the marginal effect. + This should produce a 3-dimensional array, containing the per-treatment derivative of + each transformed treatment. That is, given a treatment array of shape(n, dₜ), + the output should have shape(n, dₜ, fₜ), where fₜ is the number of columns produced by `t_featurizer`. 
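+
+    Examples
+    --------
+    A minimal, illustrative sketch on synthetic data; the featurizer choices below are
+    arbitrary assumptions made for the example rather than recommended settings::
+
+        import numpy as np
+        from sklearn.preprocessing import PolynomialFeatures
+        from econml.iv.sieve import SieveTSLS, DPolynomialFeatures
+
+        n = 1000
+        Z = np.random.normal(size=(n, 1))               # instrument
+        T = Z[:, 0] + np.random.normal(size=n)          # treatment driven by the instrument
+        X = np.random.normal(size=(n, 1))               # heterogeneity features
+        Y = T * (1 + X[:, 0]) + np.random.normal(size=n)
+
+        est = SieveTSLS(t_featurizer=PolynomialFeatures(degree=2),
+                        x_featurizer=PolynomialFeatures(degree=2),
+                        z_featurizer=PolynomialFeatures(degree=2),
+                        dt_featurizer=DPolynomialFeatures(degree=2))
+        est.fit(Y, T, X=X, W=None, Z=Z)
+        tau = est.effect(X[:5], T0=0, T1=1)             # effect of moving T from 0 to 1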
+ + """ + + def __init__(self, *, + t_featurizer, + x_featurizer, + z_featurizer, + dt_featurizer): + self._t_featurizer = clone(t_featurizer, safe=False) + self._x_featurizer = clone(x_featurizer, safe=False) + self._z_featurizer = clone(z_featurizer, safe=False) + self._dt_featurizer = clone(dt_featurizer, safe=False) + # don't fit intercept; manually add column of ones to the data instead; + # this allows us to ignore the intercept when computing marginal effects + self._model_T = LinearRegression(fit_intercept=False) + self._model_Y = LinearRegression(fit_intercept=False) + super().__init__() + + @_deprecate_positional("X, W, and Z should be passed by keyword only. In a future release " + "we will disallow passing X, W, and Z by position.", ['X', 'W', 'Z']) + @BaseCateEstimator._wrap_fit + def fit(self, Y, T, X, W, Z, *, inference=None): + """ + Estimate the counterfactual model from data, i.e. estimates functions τ(·, ·, ·), ∂τ(·, ·). + + Parameters + ---------- + Y: (n × d_y) matrix + Outcomes for each sample + T: (n × dₜ) matrix + Treatments for each sample + X: optional(n × dₓ) matrix + Features for each sample + W: optional(n × d_w) matrix + Controls for each sample + Z: optional(n × d_z) matrix + Instruments for each sample + inference: string, :class:`.Inference` instance, or None + Method for performing inference. This estimator supports 'bootstrap' + (or an instance of :class:`.BootstrapInference`) + + Returns + ------- + self + + """ + Y, T, X, W, Z = check_input_arrays(Y, T, X, W, Z) + if X is None: + X = np.empty((shape(Y)[0], 0)) + if W is None: + W = np.empty((shape(Y)[0], 0)) + assert shape(Y)[0] == shape(T)[0] == shape(X)[0] == shape(W)[0] == shape(Z)[0] + + # make T 2D if if was a vector + if ndim(T) == 1: + T = reshape(T, (-1, 1)) + + # store number of columns of W so that we can create correctly shaped zero array in effect and marginal effect + self._d_w = shape(W)[1] + + # two stage approximation + # first, get basis expansions of T, X, and Z + ft_X = self._x_featurizer.fit_transform(X) + ft_Z = self._z_featurizer.fit_transform(Z) + ft_T = self._t_featurizer.fit_transform(T) + # TODO: is it right that the effective number of intruments is the + # product of ft_X and ft_Z, not just ft_Z? + assert shape(ft_T)[1] <= shape(ft_X)[1] * shape(ft_Z)[1], ("There can be no more T features than the product " + "of the number of X and Z features; otherwise " + "there is not enough information to identify their " + "structure") + + # regress T expansion on X,Z expansions concatenated with W + features = _add_ones(np.hstack([W, cross_product(ft_X, ft_Z)])) + self._model_T.fit(features, ft_T) + # predict ft_T from interacted ft_X, ft_Z + ft_T_hat = self._model_T.predict(features) + self._model_Y.fit(_add_ones(np.hstack([W, cross_product(ft_T_hat, ft_X)])), Y) + + def effect(self, X=None, T0=0, T1=1): + """ + Calculate the heterogeneous treatment effect τ(·,·,·). + + The effect is calculated between the two treatment points + conditional on a vector of features on a set of m test samples {T0ᵢ, T1ᵢ, Xᵢ}. 
+ + Parameters + ---------- + T0: (m × dₜ) matrix or vector of length m + Base treatments for each sample + T1: (m × dₜ) matrix or vector of length m + Target treatments for each sample + X: optional (m × dₓ) matrix + Features for each sample + + Returns + ------- + τ: (m × d_y) matrix + Heterogeneous treatment effects on each outcome for each sample + Note that when Y is a vector rather than a 2-dimensional array, the corresponding + singleton dimension will be collapsed (so this method will return a vector) + + """ + if ndim(T0) == 0: + T0 = np.full((1 if X is None else shape(X)[0],) + self._d_t, T0) + if ndim(T1) == 0: + T1 = np.full((1 if X is None else shape(X)[0],) + self._d_t, T1) + if ndim(T0) == 1: + T0 = reshape(T0, (-1, 1)) + if ndim(T1) == 1: + T1 = reshape(T1, (-1, 1)) + if X is None: + X = np.empty((shape(T0)[0], 0)) + assert shape(T0) == shape(T1) + assert shape(T0)[0] == shape(X)[0] + + W = np.zeros((shape(T0)[0], self._d_w)) # can set arbitrarily since values will cancel + ft_X = self._x_featurizer.transform(X) + ft_T0 = self._t_featurizer.transform(T0) + ft_T1 = self._t_featurizer.transform(T1) + Y0 = self._model_Y.predict(_add_ones(np.hstack([W, cross_product(ft_T0, ft_X)]))) + Y1 = self._model_Y.predict(_add_ones(np.hstack([W, cross_product(ft_T1, ft_X)]))) + return Y1 - Y0 + + def marginal_effect(self, T, X=None): + """ + Calculate the heterogeneous marginal effect ∂τ(·, ·). + + The marginal effect is calculated around a base treatment + point conditional on a vector of features on a set of m test samples {Tᵢ, Xᵢ}. + + Parameters + ---------- + T: (m × dₜ) matrix + Base treatments for each sample + X: optional(m × dₓ) matrix + Features for each sample + + Returns + ------- + grad_tau: (m × d_y × dₜ) array + Heterogeneous marginal effects on each outcome for each sample + Note that when Y or T is a vector rather than a 2-dimensional array, + the corresponding singleton dimensions in the output will be collapsed + (e.g. if both are vectors, then the output of this method will also be a vector) + """ + if X is None: + X = np.empty((shape(T)[0], 0)) + assert shape(T)[0] == shape(X)[0] + + ft_X = self._x_featurizer.transform(X) + n = shape(T)[0] + dT = self._dt_featurizer.transform(T if ndim(T) == 2 else reshape(T, (-1, 1))) + W = np.zeros((size(T), self._d_w)) + # dT should be an n×dₜ×fₜ array (but if T was a vector, or if there is only one feature, + # dT may be only 2-dimensional) + # promote dT to 3D if necessary (e.g. if T was a vector) + if ndim(dT) < 3: + dT = reshape(dT, (n, 1, shape(dT)[1])) + + # reshape ft_X and dT to allow cross product (result has shape n×dₜ×fₜ×f_x) + features = reshape(ft_X, (n, 1, 1, -1)) * reshape(dT, shape(dT) + (1,)) + features = transpose(features, [0, 1, 3, 2]) # swap last two dims to match cross_product + features = reshape(features, (size(T), -1)) + output = self._model_Y.predict(_add_zeros(np.hstack([W, features]))) + output = reshape(output, shape(T) + shape(output)[1:]) + if ndim(output) == 3: + return transpose(output, (0, 2, 1)) # transpose trailing T and Y dims + else: + return output diff --git a/econml/metalearners/__init__.py b/econml/metalearners/__init__.py new file mode 100644 index 00000000..ed7735c1 --- /dev/null +++ b/econml/metalearners/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
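+
+# Illustrative note: this package re-exports the meta-learner estimators so that they can
+# be imported as ``from econml.metalearners import XLearner`` (and likewise for the other
+# classes below). A hypothetical usage sketch, assuming ``y``, ``T`` (a discrete treatment
+# vector) and ``X`` are NumPy arrays of matching length:
+#
+#     from sklearn.ensemble import GradientBoostingRegressor
+#     from econml.metalearners import XLearner
+#
+#     est = XLearner(models=GradientBoostingRegressor())
+#     est.fit(y, T, X=X)
+#     tau = est.effect(X)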
+ +from ._metalearners import (TLearner, SLearner, XLearner, DomainAdaptationLearner) + +__all__ = ["TLearner", + "SLearner", + "XLearner", + "DomainAdaptationLearner"] diff --git a/econml/metalearners.py b/econml/metalearners/_metalearners.py similarity index 98% rename from econml/metalearners.py rename to econml/metalearners/_metalearners.py index 8b2efe66..480176c6 100644 --- a/econml/metalearners.py +++ b/econml/metalearners/_metalearners.py @@ -9,15 +9,15 @@ For more details on these CATE methods, see `` import numpy as np import warnings -from ._cate_estimator import BaseCateEstimator, LinearCateEstimator, TreatmentExpansionMixin +from .._cate_estimator import BaseCateEstimator, LinearCateEstimator, TreatmentExpansionMixin from sklearn import clone from sklearn.linear_model import LogisticRegression from sklearn.pipeline import Pipeline from sklearn.utils import check_array, check_X_y from sklearn.preprocessing import OneHotEncoder, FunctionTransformer -from .utilities import (check_inputs, check_models, broadcast_unit_treatments, reshape_treatmentwise_effects, - inverse_onehot, transpose, _EncoderWrapper, _deprecate_positional) -from ._shap import _shap_explain_model_cate +from ..utilities import (check_inputs, check_models, broadcast_unit_treatments, reshape_treatmentwise_effects, + inverse_onehot, transpose, _EncoderWrapper, _deprecate_positional) +from .._shap import _shap_explain_model_cate class TLearner(TreatmentExpansionMixin, LinearCateEstimator): diff --git a/econml/orf/__init__.py b/econml/orf/__init__.py new file mode 100644 index 00000000..96d5e4d9 --- /dev/null +++ b/econml/orf/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +""" An implementation of Orthogonal Random Forests [orf]_ and special +case python classes. + +References +---------- +.. [orf] M. Oprescu, V. Syrgkanis and Z. S. Wu. + Orthogonal Random Forest for Causal Inference. + *Proceedings of the 36th International Conference on Machine Learning*, 2019. + URL http://proceedings.mlr.press/v97/oprescu19a.html. +""" + +from ._ortho_forest import DMLOrthoForest, DROrthoForest + +__all__ = ["DMLOrthoForest", + "DROrthoForest"] diff --git a/econml/_causal_tree.py b/econml/orf/_causal_tree.py similarity index 100% rename from econml/_causal_tree.py rename to econml/orf/_causal_tree.py diff --git a/econml/orf/_ortho_forest.py b/econml/orf/_ortho_forest.py new file mode 100644 index 00000000..e79c78ad --- /dev/null +++ b/econml/orf/_ortho_forest.py @@ -0,0 +1,1317 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +"""Orthogonal Random Forest. + +Orthogonal Random Forest (ORF) is an algorithm for heterogenous treatment effect +estimation. Orthogonal Random Forest combines orthogonalization, +a technique that effectively removes the confounding effect in two-stage estimation, +with generalized random forests, a flexible method for estimating treatment +effect heterogeneity. + +This file consists of classes that implement the following variants of the ORF method: + +- The :class:`DMLOrthoForest`, a two-forest approach for learning continuous or discrete treatment effects + using kernel two stage estimation. + +- The :class:`DROrthoForest`, a two-forest approach for learning discrete treatment effects + using kernel two stage estimation. + +For more details on these methods, see our paper [Oprescu2019]_. 
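+
+Both forest variants support bootstrap-of-little-bags confidence intervals through the
+``'blb'`` inference option. An illustrative call pattern (``est``, ``Y``, ``T``, ``X``, ``W``
+and ``X_test`` are placeholders here, not objects defined in this module)::
+
+    est.fit(Y, T, X=X, W=W, inference='blb')
+    lb, ub = est.const_marginal_effect_interval(X_test, alpha=0.05)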
+""" + +import abc +import inspect +import numpy as np +import warnings +from joblib import Parallel, delayed +from sklearn import clone +from scipy.stats import norm +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LassoCV, Lasso, LinearRegression, LogisticRegression, \ + LogisticRegressionCV, ElasticNet +from sklearn.model_selection import KFold, StratifiedKFold +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder, LabelEncoder, PolynomialFeatures, FunctionTransformer +from sklearn.utils import check_random_state, check_array, column_or_1d +from ..sklearn_extensions.linear_model import WeightedLassoCVWrapper +from .._cate_estimator import BaseCateEstimator, LinearCateEstimator, TreatmentExpansionMixin +from ._causal_tree import CausalTree +from ..inference import NormalInferenceResults +from ..inference._inference import Inference +from ..utilities import (reshape, reshape_Y_T, MAX_RAND_SEED, check_inputs, _deprecate_positional, + cross_product, inverse_onehot, _EncoderWrapper, check_input_arrays, + _RegressionWrapper, deprecated) +from sklearn.model_selection import check_cv +# TODO: consider working around relying on sklearn implementation details +from ..sklearn_extensions.model_selection import _cross_val_predict + + +def _build_tree_in_parallel(tree, Y, T, X, W, + nuisance_estimator, parameter_estimator, moment_and_mean_gradient_estimator): + # Create splits of causal tree + tree.create_splits(Y, T, X, W, nuisance_estimator, parameter_estimator, moment_and_mean_gradient_estimator) + return tree + + +def _fit_weighted_pipeline(model_instance, X, y, sample_weight): + weights_error_msg = ( + "Estimators of type {} do not accept weights. " + "Consider using the class WeightedModelWrapper from econml.utilities to build a weighted model." 
+ ) + expected_error_msg = "fit() got an unexpected keyword argument 'sample_weight'" + if not isinstance(model_instance, Pipeline): + try: + model_instance.fit(X, y, sample_weight=sample_weight) + except TypeError as e: + if expected_error_msg in str(e): + # Make sure the correct exception is being rethrown + raise TypeError(weights_error_msg.format(model_instance.__class__.__name__)) + else: + raise e + else: + try: + last_step_name = model_instance.steps[-1][0] + model_instance.fit(X, y, **{"{0}__sample_weight".format(last_step_name): sample_weight}) + except TypeError as e: + if expected_error_msg in str(e): + raise TypeError(weights_error_msg.format(model_instance.steps[-1][1].__class__.__name__)) + else: + raise e + + +def _cross_fit(model_instance, X, y, split_indices, sample_weight=None, predict_func_name='predict'): + model_instance1 = clone(model_instance, safe=False) + model_instance2 = clone(model_instance, safe=False) + split_1, split_2 = split_indices + predict_func1 = getattr(model_instance1, predict_func_name) + predict_func2 = getattr(model_instance2, predict_func_name) + if sample_weight is None: + model_instance2.fit(X[split_2], y[split_2]) + pred_1 = predict_func2(X[split_1]) + model_instance1.fit(X[split_1], y[split_1]) + pred_2 = predict_func1(X[split_2]) + else: + _fit_weighted_pipeline(model_instance2, X[split_2], y[split_2], sample_weight[split_2]) + pred_1 = predict_func2(X[split_1]) + _fit_weighted_pipeline(model_instance1, X[split_1], y[split_1], sample_weight[split_1]) + pred_2 = predict_func1(X[split_2]) + # Must make sure indices are merged correctly + sorted_split_indices = np.argsort(np.concatenate(split_indices), kind='mergesort') + return np.concatenate((pred_1, pred_2))[sorted_split_indices] + + +def _group_predict(X, n_groups, predict_func): + """ Helper function that predicts using the predict function + for every input argument that looks like [X; i] for i in range(n_groups). Used in + DR moments, where we want to predict for each [X; t], for any value of the treatment t. + Returns an (X.shape[0], n_groups) matrix of predictions for each row of X and each t in range(n_groups). 
+ + Parameters + ---------- + X : (n, m) array + Features for each sample + n_groups : int + Number of treatment groups to predict for + predict_func : callable + Prediction function applied to the concatenation of X and a one-hot group indicator + + Returns + ------- + pred : (n, n_groups) array + """ + group_pred = np.zeros((X.shape[0], n_groups)) + zero_t = np.zeros((X.shape[0], n_groups)) + for i in range(n_groups): + zero_t[:, i] = 1 + group_pred[:, i] = predict_func(np.concatenate((X, zero_t), axis=1)) + zero_t[:, i] = 0 + return group_pred + + +def _group_cross_fit(model_instance, X, y, t, split_indices, sample_weight=None, predict_func_name='predict'): + # Require group assignment t to be one-hot-encoded + model_instance1 = clone(model_instance, safe=False) + model_instance2 = clone(model_instance, safe=False) + split_1, split_2 = split_indices + n_groups = t.shape[1] + predict_func1 = getattr(model_instance1, predict_func_name) + predict_func2 = getattr(model_instance2, predict_func_name) + Xt = np.concatenate((X, t), axis=1) + # Get predictions for the 2 splits + if sample_weight is None: + model_instance2.fit(Xt[split_2], y[split_2]) + pred_1 = _group_predict(X[split_1], n_groups, predict_func2) + model_instance1.fit(Xt[split_1], y[split_1]) + pred_2 = _group_predict(X[split_2], n_groups, predict_func1) + else: + _fit_weighted_pipeline(model_instance2, Xt[split_2], y[split_2], sample_weight[split_2]) + pred_1 = _group_predict(X[split_1], n_groups, predict_func2) + _fit_weighted_pipeline(model_instance1, Xt[split_1], y[split_1], sample_weight[split_1]) + pred_2 = _group_predict(X[split_2], n_groups, predict_func1) + # Must make sure indices are merged correctly + sorted_split_indices = np.argsort(np.concatenate(split_indices), kind='mergesort') + return np.concatenate((pred_1, pred_2))[sorted_split_indices] + + +def _pointwise_effect(X_single, Y, T, X, W, w_nonzero, split_inds, slice_weights_list, + second_stage_nuisance_estimator, second_stage_parameter_estimator, + moment_and_mean_gradient_estimator, slice_len, n_slices, n_trees, + stderr=False): + """Calculate the effect for a single data point with features X_single. + + Parameters + ---------- + X_single : array-like, shape (d_x, ) + Feature vector that captures heterogeneity for one sample. + + stderr : boolean (default=False) + Whether to calculate the covariance matrix via bootstrap-of-little-bags. + """ + # Crossfitting + # Compute weighted nuisance estimates + nuisance_estimates = second_stage_nuisance_estimator(Y, T, X, W, w_nonzero, split_indices=split_inds) + parameter_estimate = second_stage_parameter_estimator(Y, T, X, nuisance_estimates, w_nonzero, X_single) + # ------------------------------------------------------------------------------- + # Calculate the covariance matrix corresponding to the BLB inference + # + # 1. Calculate the moments and gradient of the training data w.r.t. the test point + # 2. Calculate the weighted moments for each tree slice to create a matrix + # U = (n_slices, n_T). The V = (U x grad^{-1}) matrix represents the deviation + # in that slice from the overall parameter estimate. + # 3.
Calculate the covariance matrix (V.T x V) / n_slices + # ------------------------------------------------------------------------------- + if stderr: + moments, mean_grad = moment_and_mean_gradient_estimator(Y, T, X, W, nuisance_estimates, + parameter_estimate) + # Calclulate covariance matrix through BLB + slice_weighted_moment_one = [] + slice_weighted_moment_two = [] + for slice_weights_one, slice_weights_two in slice_weights_list: + slice_weighted_moment_one.append( + np.average(moments[:len(split_inds[0])], axis=0, weights=slice_weights_one) + ) + slice_weighted_moment_two.append( + np.average(moments[len(split_inds[0]):], axis=0, weights=slice_weights_two) + ) + U = np.vstack(slice_weighted_moment_one + slice_weighted_moment_two) + inverse_grad = np.linalg.inv(mean_grad) + cov_mat = inverse_grad.T @ U.T @ U @ inverse_grad / (2 * n_slices) + return parameter_estimate, cov_mat + return parameter_estimate + + +class BaseOrthoForest(TreatmentExpansionMixin, LinearCateEstimator): + """Base class for the :class:`DMLOrthoForest` and :class:`DROrthoForest`.""" + + def __init__(self, + nuisance_estimator, + second_stage_nuisance_estimator, + parameter_estimator, + second_stage_parameter_estimator, + moment_and_mean_gradient_estimator, + discrete_treatment=False, + categories='auto', + n_trees=500, + min_leaf_size=10, max_depth=10, + subsample_ratio=0.25, + bootstrap=False, + n_jobs=-1, + backend='loky', + verbose=3, + batch_size='auto', + random_state=None): + # Estimators + self.nuisance_estimator = nuisance_estimator + self.second_stage_nuisance_estimator = second_stage_nuisance_estimator + self.parameter_estimator = parameter_estimator + self.second_stage_parameter_estimator = second_stage_parameter_estimator + self.moment_and_mean_gradient_estimator = moment_and_mean_gradient_estimator + # OrthoForest parameters + self.n_trees = n_trees + self.min_leaf_size = min_leaf_size + self.max_depth = max_depth + self.bootstrap = bootstrap + self.subsample_ratio = subsample_ratio + self.n_jobs = n_jobs + self.random_state = check_random_state(random_state) + # Sub-forests + self.forest_one_trees = None + self.forest_two_trees = None + self.forest_one_subsample_ind = None + self.forest_two_subsample_ind = None + # Auxiliary attributes + self.n_slices = int(np.ceil((self.n_trees)**(1 / 2))) + self.slice_len = int(np.ceil(self.n_trees / self.n_slices)) + # Fit check + self.model_is_fitted = False + self.discrete_treatment = discrete_treatment + self.backend = backend + self.verbose = verbose + self.batch_size = batch_size + super().__init__() + + @_deprecate_positional("X and W should be passed by keyword only. In a future release " + "we will disallow passing X and W by position.", ['X', 'W']) + @BaseCateEstimator._wrap_fit + def fit(self, Y, T, X, W=None, *, inference='auto'): + """Build an orthogonal random forest from a training set (Y, T, X, W). + + Parameters + ---------- + Y : array-like, shape (n, ) + Outcome for the treatment policy. + + T : array-like, shape (n, d_t) + Treatment policy. + + X : array-like, shape (n, d_x) + Feature vector that captures heterogeneity. + + W : array-like, shape (n, d_w) or None (default=None) + High-dimensional controls. + + inference: string, :class:`.Inference` instance, or None + Method for performing inference. This estimator supports 'bootstrap' + (or an instance of :class:`.BootstrapInference`) and 'blb' (or an instance of :class:`BLBInference`) + + Returns + ------- + self: an instance of self. 
+ """ + Y, T, X, W = check_inputs(Y, T, X, W, multi_output_Y=False) + shuffled_inidces = self.random_state.permutation(X.shape[0]) + n = X.shape[0] // 2 + self.Y_one = Y[shuffled_inidces[:n]] + self.Y_two = Y[shuffled_inidces[n:]] + self.T_one = T[shuffled_inidces[:n]] + self.T_two = T[shuffled_inidces[n:]] + self.X_one = X[shuffled_inidces[:n]] + self.X_two = X[shuffled_inidces[n:]] + if W is not None: + self.W_one = W[shuffled_inidces[:n]] + self.W_two = W[shuffled_inidces[n:]] + else: + self.W_one = None + self.W_two = None + self.forest_one_subsample_ind, self.forest_one_trees = self._fit_forest(Y=self.Y_one, + T=self.T_one, + X=self.X_one, + W=self.W_one) + self.forest_two_subsample_ind, self.forest_two_trees = self._fit_forest(Y=self.Y_two, + T=self.T_two, + X=self.X_two, + W=self.W_two) + self.model_is_fitted = True + return self + + def const_marginal_effect(self, X): + """Calculate the constant marginal CATE θ(·) conditional on a vector of features X. + + Parameters + ---------- + X : array-like, shape (n, d_x) + Feature vector that captures heterogeneity. + + Returns + ------- + Theta : matrix , shape (n, d_t) + Constant marginal CATE of each treatment for each sample. + """ + # TODO: Check performance + return np.asarray(self._predict(X)) + + def _predict(self, X, stderr=False): + if not self.model_is_fitted: + raise NotFittedError('This {0} instance is not fitted yet.'.format(self.__class__.__name__)) + X = check_array(X) + results = Parallel(n_jobs=self.n_jobs, backend=self.backend, + batch_size=self.batch_size, verbose=self.verbose)( + delayed(_pointwise_effect)(X_single, *self._pw_effect_inputs(X_single, stderr=stderr), + self.second_stage_nuisance_estimator, self.second_stage_parameter_estimator, + self.moment_and_mean_gradient_estimator, self.slice_len, self.n_slices, + self.n_trees, + stderr=stderr) for X_single in X) + return results + + def _pw_effect_inputs(self, X_single, stderr=False): + w1, w2 = self._get_weights(X_single) + mask_w1 = (w1 != 0) + mask_w2 = (w2 != 0) + w1_nonzero = w1[mask_w1] + w2_nonzero = w2[mask_w2] + # Must normalize weights + w_nonzero = np.concatenate((w1_nonzero, w2_nonzero)) + split_inds = (np.arange(len(w1_nonzero)), np.arange(len(w1_nonzero), len(w_nonzero))) + slice_weights_list = [] + if stderr: + slices = [ + (it * self.slice_len, min((it + 1) * self.slice_len, self.n_trees)) for it in range(self.n_slices) + ] + for slice_it in slices: + slice_weights_one, slice_weights_two = self._get_weights(X_single, tree_slice=slice_it) + slice_weights_list.append((slice_weights_one[mask_w1], slice_weights_two[mask_w2])) + W_none = self.W_one is None + return np.concatenate((self.Y_one[mask_w1], self.Y_two[mask_w2])),\ + np.concatenate((self.T_one[mask_w1], self.T_two[mask_w2])),\ + np.concatenate((self.X_one[mask_w1], self.X_two[mask_w2])),\ + np.concatenate((self.W_one[mask_w1], self.W_two[mask_w2]) + ) if not W_none else None,\ + w_nonzero,\ + split_inds, slice_weights_list + + def _get_inference_options(self): + # Override the CATE inference options + # Add blb inference to parent's options + options = super()._get_inference_options() + options.update(blb=BLBInference) + options.update(auto=BLBInference) + return options + + def _fit_forest(self, Y, T, X, W=None): + # Generate subsample indices + subsample_ind = self._get_blb_indices(X) + # Build trees in parallel + trees = [CausalTree(self.min_leaf_size, self.max_depth, 1000, .4, + check_random_state(self.random_state.randint(MAX_RAND_SEED))) + for _ in range(len(subsample_ind))] + return 
subsample_ind, Parallel(n_jobs=self.n_jobs, backend=self.backend, + batch_size=self.batch_size, verbose=self.verbose, max_nbytes=None)( + delayed(_build_tree_in_parallel)(tree, + Y[s], T[s], X[s], W[s] if W is not None else None, + self.nuisance_estimator, + self.parameter_estimator, + self.moment_and_mean_gradient_estimator) + for s, tree in zip(subsample_ind, trees)) + + def _get_weights(self, X_single, tree_slice=None): + """Calculate weights for a single input feature vector over a subset of trees. + + The subset of trees is defined by the `tree_slice` tuple (start, end). + The (start, end) tuple includes all trees from `start` to `end`-1. + """ + w1 = np.zeros(self.Y_one.shape[0]) + w2 = np.zeros(self.Y_two.shape[0]) + if tree_slice is None: + tree_range = range(self.n_trees) + else: + tree_range = range(*tree_slice) + for t in tree_range: + leaf = self.forest_one_trees[t].find_split(X_single) + weight_indexes = self.forest_one_subsample_ind[t][leaf.est_sample_inds] + leaf_weight = 1 / len(leaf.est_sample_inds) + if self.bootstrap: + # Bootstrapping has repetitions in the tree sample + unique, counts = np.unique(weight_indexes, return_counts=True) + w1[unique] += leaf_weight * counts + else: + w1[weight_indexes] += leaf_weight + for t in tree_range: + leaf = self.forest_two_trees[t].find_split(X_single) + # Similarly accumulate the weights contributed by the second forest + weight_indexes = self.forest_two_subsample_ind[t][leaf.est_sample_inds] + leaf_weight = 1 / len(leaf.est_sample_inds) + if self.bootstrap: + # Bootstrapping has repetitions in the tree sample + unique, counts = np.unique(weight_indexes, return_counts=True) + w2[unique] += leaf_weight * counts + else: + w2[weight_indexes] += leaf_weight + return (w1 / len(tree_range), w2 / len(tree_range)) + + def _get_blb_indices(self, X): + """Get data indices for every tree under the little bags split.""" + # Define subsample size + subsample_size = X.shape[0] // 2 + if not self.bootstrap: + if self.subsample_ratio > 1.0: + # Safety check + warnings.warn("The argument 'subsample_ratio' must be between 0.0 and 1.0, " + + "however a value of {} was provided. The 'subsample_ratio' will be changed to 1.0.".format(self.subsample_ratio)) + self.subsample_ratio = 1.0 + subsample_size = int(self.subsample_ratio * subsample_size) + subsample_ind = [] + # Draw points to create little bags + for it in range(self.n_slices): + half_sample_inds = self.random_state.choice( + X.shape[0], X.shape[0] // 2, replace=False) + for _ in np.arange(it * self.slice_len, min((it + 1) * self.slice_len, self.n_trees)): + subsample_ind.append(half_sample_inds[self.random_state.choice( + X.shape[0] // 2, subsample_size, replace=self.bootstrap)]) + return np.asarray(subsample_ind) + + +class DMLOrthoForest(BaseOrthoForest): + """OrthoForest for continuous or discrete treatments using the DML residual-on-residual moment function. + + A two-forest approach for learning heterogeneous treatment effects using + kernel two stage estimation. + + Parameters + ---------- + n_trees : integer, optional (default=500) + Number of causal estimators in the forest. + + min_leaf_size : integer, optional (default=10) + The minimum number of samples in a leaf. + + max_depth : integer, optional (default=10) + The maximum number of splits to be performed when expanding the tree. + + subsample_ratio : float, optional (default=0.7) + The ratio of the total sample to be used when training a causal tree. + Values greater than 1.0 will be considered equal to 1.0. + Parameter is ignored when bootstrap=True.
+ + bootstrap : boolean, optional (default=False) + Whether to use bootstrap subsampling. + + lambda_reg : float, optional (default=0.01) + The regularization coefficient in the ell_2 penalty imposed on the + locally linear part of the second stage fit. This is not applied to + the local intercept, only to the coefficient of the linear component. + + model_T : estimator, optional (default=sklearn.linear_model.LassoCV(cv=3)) + The estimator for residualizing the continuous treatment at each leaf. + Must implement `fit` and `predict` methods. + + model_Y : estimator, optional (default=sklearn.linear_model.LassoCV(cv=3) + The estimator for residualizing the outcome at each leaf. Must implement + `fit` and `predict` methods. + + model_T_final : estimator, optional (default=None) + The estimator for residualizing the treatment at prediction time. Must implement + `fit` and `predict` methods. If parameter is set to ``None``, it defaults to the + value of `model_T` parameter. + + model_Y_final : estimator, optional (default=None) + The estimator for residualizing the outcome at prediction time. Must implement + `fit` and `predict` methods. If parameter is set to ``None``, it defaults to the + value of `model_Y` parameter. + + global_residualization : bool, optional (default=False) + Whether to perform a prior residualization of Y and T using the model_Y_final and model_T_final + estimators, or whether to perform locally weighted residualization at each target point. + Global residualization is computationally less intensive, but could lose some statistical + power, especially when W is not None. + + global_res_cv : int, cross-validation generator or an iterable, optional (default=2) + The specification of the CV splitter to be used for cross-fitting, when constructing + the global residuals of Y and T. + + discrete_treatment : bool, optional (default=False) + Whether the treatment should be treated as categorical. If True, then the treatment T is + one-hot-encoded and the model_T is treated as a classifier that must have a predict_proba + method. + + categories : array like or 'auto', optional (default='auto') + A list of pre-specified treatment categories. If 'auto' then categories are automatically + recognized at fit time. + + n_jobs : int, optional (default=-1) + The number of jobs to run in parallel for both :meth:`fit` and :meth:`effect`. + ``-1`` means using all processors. Since OrthoForest methods are + computationally heavy, it is recommended to set `n_jobs` to -1. + + backend : 'threading' or 'loky', optional (default='loky') + What backend should be used for parallelization with the joblib library. + + verbose : int, optional (default=3) + Verbosity level + + batch_size : int or 'auto', optional (default='auto') + Batch_size of jobs for parallelism + + random_state : int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; + If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used + by :mod:`np.random`. 
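+
+    Examples
+    --------
+    A minimal, illustrative sketch on synthetic data (the hyperparameters below are
+    arbitrary choices made for the example, not recommended settings)::
+
+        import numpy as np
+        from econml.orf import DMLOrthoForest
+
+        n = 2000
+        X = np.random.uniform(-1, 1, size=(n, 2))
+        T = np.random.normal(size=n)
+        Y = (1 + X[:, 0]) * T + np.random.normal(size=n)
+
+        est = DMLOrthoForest(n_trees=100, min_leaf_size=5, subsample_ratio=0.5)
+        est.fit(Y, T, X=X, inference='blb')
+        theta = est.const_marginal_effect(X[:5])
+        lb, ub = est.const_marginal_effect_interval(X[:5], alpha=0.05)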
+ + """ + + def __init__(self, *, + n_trees=500, + min_leaf_size=10, max_depth=10, + subsample_ratio=0.7, + bootstrap=False, + lambda_reg=0.01, + model_T='auto', + model_Y=WeightedLassoCVWrapper(cv=3), + model_T_final=None, + model_Y_final=None, + global_residualization=False, + global_res_cv=2, + discrete_treatment=False, + categories='auto', + n_jobs=-1, + backend='loky', + verbose=3, + batch_size='auto', + random_state=None): + # Copy and/or define models + self.lambda_reg = lambda_reg + if model_T == 'auto': + if discrete_treatment: + model_T = LogisticRegressionCV(cv=3) + else: + model_T = WeightedLassoCVWrapper(cv=3) + self.model_T = model_T + self.model_Y = model_Y + self.model_T_final = model_T_final + self.model_Y_final = model_Y_final + if self.model_T_final is None: + self.model_T_final = clone(self.model_T, safe=False) + if self.model_Y_final is None: + self.model_Y_final = clone(self.model_Y, safe=False) + if discrete_treatment: + self.model_T = _RegressionWrapper(self.model_T) + self.model_T_final = _RegressionWrapper(self.model_T_final) + self.random_state = check_random_state(random_state) + self.global_residualization = global_residualization + self.global_res_cv = global_res_cv + # Define nuisance estimators + nuisance_estimator = _DMLOrthoForest_nuisance_estimator_generator( + self.model_T, self.model_Y, self.random_state, second_stage=False, + global_residualization=self.global_residualization, discrete_treatment=discrete_treatment) + second_stage_nuisance_estimator = _DMLOrthoForest_nuisance_estimator_generator( + self.model_T_final, self.model_Y_final, self.random_state, second_stage=True, + global_residualization=self.global_residualization, discrete_treatment=discrete_treatment) + # Define parameter estimators + parameter_estimator = _DMLOrthoForest_parameter_estimator_func + second_stage_parameter_estimator = _DMLOrthoForest_second_stage_parameter_estimator_gen( + self.lambda_reg) + # Define + moment_and_mean_gradient_estimator = _DMLOrthoForest_moment_and_mean_gradient_estimator_func + if discrete_treatment: + if categories != 'auto': + categories = [categories] # OneHotEncoder expects a 2D array with features per column + self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first') + super().__init__( + nuisance_estimator, + second_stage_nuisance_estimator, + parameter_estimator, + second_stage_parameter_estimator, + moment_and_mean_gradient_estimator, + n_trees=n_trees, + min_leaf_size=min_leaf_size, + max_depth=max_depth, + subsample_ratio=subsample_ratio, + bootstrap=bootstrap, + n_jobs=n_jobs, + backend=backend, + verbose=verbose, + batch_size=batch_size, + discrete_treatment=discrete_treatment, + categories=categories, + random_state=self.random_state) + + def _combine(self, X, W): + if X is None: + return W + if W is None: + return X + return np.hstack([X, W]) + + # Need to redefine fit here for auto inference to work due to a quirk in how + # wrap_fit is defined + @_deprecate_positional("X and W should be passed by keyword only. In a future release " + "we will disallow passing X and W by position.", ['X', 'W']) + def fit(self, Y, T, X, W=None, *, inference='auto'): + """Build an orthogonal random forest from a training set (Y, T, X, W). + + Parameters + ---------- + Y : array-like, shape (n, ) + Outcome for the treatment policy. + + T : array-like, shape (n, d_t) + Treatment policy. + + X : array-like, shape (n, d_x) + Feature vector that captures heterogeneity. 
+ + W : array-like, shape (n, d_w) or None (default=None) + High-dimensional controls. + + inference: string, :class:`.Inference` instance, or None + Method for performing inference. This estimator supports 'bootstrap' + (or an instance of :class:`.BootstrapInference`) and 'blb' (or an instance of :class:`BLBInference`) + + Returns + ------- + self: an instance of self. + """ + self._set_input_names(Y, T, X, set_flag=True) + Y, T, X, W = check_inputs(Y, T, X, W) + if self.discrete_treatment: + d_t_in = T.shape[1:] + T = self._one_hot_encoder.fit_transform(T.reshape(-1, 1)) + self._d_t = T.shape[1:] + self.transformer = FunctionTransformer( + func=_EncoderWrapper(self._one_hot_encoder).encode, + validate=False) + + if self.global_residualization: + cv = check_cv(self.global_res_cv, y=T, classifier=self.discrete_treatment) + cv = list(cv.split(X=X, y=T)) + Y = Y - _cross_val_predict(self.model_Y_final, self._combine(X, W), Y, cv=cv, safe=False).reshape(Y.shape) + T = T - _cross_val_predict(self.model_T_final, self._combine(X, W), T, cv=cv, safe=False).reshape(T.shape) + + super().fit(Y, T, X=X, W=W, inference=inference) + + # weirdness of wrap_fit. We need to store d_t_in. But because wrap_fit decorates the parent + # fit, we need to set explicitly d_t_in here after super fit is called. + if self.discrete_treatment: + self._d_t_in = d_t_in + return self + + def const_marginal_effect(self, X): + X = check_array(X) + # Override to flatten output if T is flat + effects = super().const_marginal_effect(X=X) + return effects.reshape((-1,) + self._d_y + self._d_t) + const_marginal_effect.__doc__ = BaseOrthoForest.const_marginal_effect.__doc__ + + +class _DMLOrthoForest_nuisance_estimator_generator: + """Generate nuissance estimator given model inputs from the class.""" + + def __init__(self, model_T, model_Y, random_state=None, second_stage=True, + global_residualization=False, discrete_treatment=False): + self.model_T = model_T + self.model_Y = model_Y + self.random_state = random_state + self.second_stage = second_stage + self.global_residualization = global_residualization + self.discrete_treatment = discrete_treatment + + def __call__(self, Y, T, X, W, sample_weight=None, split_indices=None): + if self.global_residualization: + return 0 + if self.discrete_treatment: + # Check that all discrete treatments are represented + if len(np.unique(T @ np.arange(1, T.shape[1] + 1))) < T.shape[1] + 1: + return None + # Nuissance estimates evaluated with cross-fitting + this_random_state = check_random_state(self.random_state) + if (split_indices is None) and self.second_stage: + if self.discrete_treatment: + # Define 2-fold iterator + kfold_it = StratifiedKFold(n_splits=2, shuffle=True, random_state=this_random_state).split(X, T) + # Check if there is only one example of some class + with warnings.catch_warnings(): + warnings.filterwarnings('error') + try: + split_indices = list(kfold_it)[0] + except Warning as warn: + msg = str(warn) + if "The least populated class in y has only 1 members" in msg: + return None + else: + # Define 2-fold iterator + kfold_it = KFold(n_splits=2, shuffle=True, random_state=this_random_state).split(X) + split_indices = list(kfold_it)[0] + if W is not None: + X_tilde = np.concatenate((X, W), axis=1) + else: + X_tilde = X + + try: + if self.second_stage: + T_hat = _cross_fit(self.model_T, X_tilde, T, split_indices, sample_weight=sample_weight) + Y_hat = _cross_fit(self.model_Y, X_tilde, Y, split_indices, sample_weight=sample_weight) + else: + # need safe=False when cloning for 
WeightedModelWrapper + T_hat = clone(self.model_T, safe=False).fit(X_tilde, T).predict(X_tilde) + Y_hat = clone(self.model_Y, safe=False).fit(X_tilde, Y).predict(X_tilde) + except ValueError as exc: + raise ValueError("The original error: {0}".format(str(exc)) + + " This might be caused by too few sample in the tree leafs." + + " Try increasing the min_leaf_size.") + return Y_hat, T_hat + + +def _DMLOrthoForest_parameter_estimator_func(Y, T, X, + nuisance_estimates, + sample_weight=None): + """Calculate the parameter of interest for points given by (Y, T) and corresponding nuisance estimates.""" + # Compute residuals + Y_res, T_res = _DMLOrthoForest_get_conforming_residuals(Y, T, nuisance_estimates) + # Compute coefficient by OLS on residuals + param_estimate = LinearRegression(fit_intercept=False).fit( + T_res, Y_res, sample_weight=sample_weight + ).coef_ + # Parameter returned by LinearRegression is (d_T, ) + return param_estimate + + +class _DMLOrthoForest_second_stage_parameter_estimator_gen: + """ + For the second stage parameter estimation we add a local linear correction. So + we fit a local linear function as opposed to a local constant function. We also penalize + the linear part to reduce variance. + """ + + def __init__(self, lambda_reg): + self.lambda_reg = lambda_reg + + def __call__(self, Y, T, X, + nuisance_estimates, + sample_weight, + X_single): + """Calculate the parameter of interest for points given by (Y, T) and corresponding nuisance estimates. + + The parameter is calculated around the feature vector given by `X_single`. `X_single` can be used to do + local corrections on a preliminary parameter estimate. + """ + # Compute residuals + Y_res, T_res = _DMLOrthoForest_get_conforming_residuals(Y, T, nuisance_estimates) + X_aug = np.hstack([np.ones((X.shape[0], 1)), X]) + XT_res = cross_product(T_res, X_aug) + # Compute coefficient by OLS on residuals + if sample_weight is not None: + weighted_XT_res = sample_weight.reshape(-1, 1) * XT_res + else: + weighted_XT_res = XT_res / XT_res.shape[0] + # ell_2 regularization + diagonal = np.ones(XT_res.shape[1]) + diagonal[:T_res.shape[1]] = 0 + reg = self.lambda_reg * np.diag(diagonal) + # Ridge regression estimate + linear_coef_estimate = np.linalg.lstsq(np.matmul(weighted_XT_res.T, XT_res) + reg, + np.matmul(weighted_XT_res.T, Y_res.reshape(-1, 1)), + rcond=None)[0].flatten() + X_aug = np.append([1], X_single) + linear_coef_estimate = linear_coef_estimate.reshape((X_aug.shape[0], -1)).T + # Parameter returned is of shape (d_T, ) + return np.dot(linear_coef_estimate, X_aug) + + +def _DMLOrthoForest_moment_and_mean_gradient_estimator_func(Y, T, X, W, + nuisance_estimates, + parameter_estimate): + """Calculate the moments and mean gradient at points given by (Y, T, X, W).""" + # Return moments and gradients + # Compute residuals + Y_res, T_res = _DMLOrthoForest_get_conforming_residuals(Y, T, nuisance_estimates) + # Compute moments + # Moments shape is (n, d_T) + moments = (Y_res - np.matmul(T_res, parameter_estimate)).reshape(-1, 1) * T_res + # Compute moment gradients + mean_gradient = - np.matmul(T_res.T, T_res) / T_res.shape[0] + return moments, mean_gradient + + +def _DMLOrthoForest_get_conforming_residuals(Y, T, nuisance_estimates): + if nuisance_estimates == 0: + return reshape_Y_T(Y, T) + # returns shape-conforming residuals + Y_hat, T_hat = reshape_Y_T(*nuisance_estimates) + Y, T = reshape_Y_T(Y, T) + Y_res, T_res = Y - Y_hat, T - T_hat + return Y_res, T_res + + +class DROrthoForest(BaseOrthoForest): + """ + OrthoForest 
for discrete treatments using the doubly robust moment function. + + A two-forest approach for learning heterogeneous treatment effects using + kernel two stage estimation. + + Parameters + ---------- + n_trees : integer, optional (default=500) + Number of causal estimators in the forest. + + min_leaf_size : integer, optional (default=10) + The minimum number of samples in a leaf. + + max_depth : integer, optional (default=10) + The maximum number of splits to be performed when expanding the tree. + + subsample_ratio : float, optional (default=0.7) + The ratio of the total sample to be used when training a causal tree. + Values greater than 1.0 will be considered equal to 1.0. + Parameter is ignored when bootstrap=True. + + bootstrap : boolean, optional (default=False) + Whether to use bootstrap subsampling. + + lambda_reg : float, optional (default=0.01) + The regularization coefficient in the ell_2 penalty imposed on the + locally linear part of the second stage fit. This is not applied to + the local intercept, only to the coefficient of the linear component. + + propensity_model : estimator, optional (default=sklearn.linear_model.LogisticRegression(penalty='l1',\ + solver='saga',\ + multi_class='auto')) + Model for estimating propensity of treatment at each leaf. + Will be trained on features and controls (concatenated). Must implement `fit` and `predict_proba` methods. + + model_Y : estimator, optional (default=sklearn.linear_model.LassoCV(cv=3)) + Estimator for learning potential outcomes at each leaf. + Will be trained on features, controls and one hot encoded treatments (concatenated). + If different models per treatment arm are desired, see the :class:`.MultiModelWrapper` + helper class. The model(s) must implement `fit` and `predict` methods. + + propensity_model_final : estimator, optional (default=None) + Model for estimating propensity of treatment at at prediction time. + Will be trained on features and controls (concatenated). Must implement `fit` and `predict_proba` methods. + If parameter is set to ``None``, it defaults to the value of `propensity_model` parameter. + + model_Y_final : estimator, optional (default=None) + Estimator for learning potential outcomes at prediction time. + Will be trained on features, controls and one hot encoded treatments (concatenated). + If different models per treatment arm are desired, see the :class:`.MultiModelWrapper` + helper class. The model(s) must implement `fit` and `predict` methods. + If parameter is set to ``None``, it defaults to the value of `model_Y` parameter. + + categories: 'auto' or list + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + + n_jobs : int, optional (default=-1) + The number of jobs to run in parallel for both :meth:`fit` and :meth:`effect`. + ``-1`` means using all processors. Since OrthoForest methods are + computationally heavy, it is recommended to set `n_jobs` to -1. + + backend : 'threading' or 'loky', optional (default='loky') + What backend should be used for parallelization with the joblib library. 
+ + verbose : int, optional (default=3) + Verbosity level + + batch_size : int or 'auto', optional (default='auto') + Batch_size of jobs for parallelism + + random_state : int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; + If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used + by :mod:`np.random`. + + + """ + + def __init__(self, *, + n_trees=500, + min_leaf_size=10, max_depth=10, + subsample_ratio=0.7, + bootstrap=False, + lambda_reg=0.01, + propensity_model=LogisticRegression(penalty='l1', solver='saga', + multi_class='auto'), # saga solver supports l1 + model_Y=WeightedLassoCVWrapper(cv=3), + propensity_model_final=None, + model_Y_final=None, + categories='auto', + n_jobs=-1, + backend='loky', + verbose=3, + batch_size='auto', + random_state=None): + # Copy and/or define models + self.propensity_model = clone(propensity_model, safe=False) + self.model_Y = clone(model_Y, safe=False) + self.propensity_model_final = clone(propensity_model_final, safe=False) + self.model_Y_final = clone(model_Y_final, safe=False) + if self.propensity_model_final is None: + self.propensity_model_final = clone(self.propensity_model, safe=False) + if self.model_Y_final is None: + self.model_Y_final = clone(self.model_Y, safe=False) + self.random_state = check_random_state(random_state) + + nuisance_estimator = DROrthoForest.nuisance_estimator_generator( + self.propensity_model, self.model_Y, self.random_state, second_stage=False) + second_stage_nuisance_estimator = DROrthoForest.nuisance_estimator_generator( + self.propensity_model_final, self.model_Y_final, self.random_state, second_stage=True) + # Define parameter estimators + parameter_estimator = DROrthoForest.parameter_estimator_func + second_stage_parameter_estimator = DROrthoForest.second_stage_parameter_estimator_gen( + lambda_reg) + # Define moment and mean gradient estimator + moment_and_mean_gradient_estimator = DROrthoForest.moment_and_mean_gradient_estimator_func + if categories != 'auto': + categories = [categories] # OneHotEncoder expects a 2D array with features per column + self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first') + + super().__init__( + nuisance_estimator, + second_stage_nuisance_estimator, + parameter_estimator, + second_stage_parameter_estimator, + moment_and_mean_gradient_estimator, + discrete_treatment=True, + categories=categories, + n_trees=n_trees, + min_leaf_size=min_leaf_size, + max_depth=max_depth, + subsample_ratio=subsample_ratio, + bootstrap=bootstrap, + n_jobs=n_jobs, + backend=backend, + verbose=verbose, + batch_size=batch_size, + random_state=self.random_state) + + @_deprecate_positional("X and W should be passed by keyword only. In a future release " + "we will disallow passing X and W by position.", ['X', 'W']) + def fit(self, Y, T, X, W=None, *, inference='auto'): + """Build an orthogonal random forest from a training set (Y, T, X, W). + + Parameters + ---------- + Y : array-like, shape (n, ) + Outcome for the treatment policy. Must be a vector. + + T : array-like, shape (n, ) + Discrete treatment policy vector. The treatment policy should be a set of consecutive integers + starting with `0`, where `0` denotes the control group. 
Otherwise, the treatment policies + will be ordered lexicographically, with the smallest value being considered the control group. + + X : array-like, shape (n, d_x) + Feature vector that captures heterogeneity. + + W : array-like, shape (n, d_w) or None (default=None) + High-dimensional controls. + + inference: string, :class:`.Inference` instance, or None + Method for performing inference. This estimator supports 'bootstrap' + (or an instance of :class:`.BootstrapInference`) and 'blb' (or an instance of :class:`BLBInference`) + + Returns + ------- + self: an instance of self. + """ + self._set_input_names(Y, T, X, set_flag=True) + Y, T, X, W = check_inputs(Y, T, X, W) + # Check that T is shape (n, ) + # Check T is numeric + T = self._check_treatment(T) + d_t_in = T.shape[1:] + # Train label encoder + T = self._one_hot_encoder.fit_transform(T.reshape(-1, 1)) + self._d_t = T.shape[1:] + self.transformer = FunctionTransformer( + func=_EncoderWrapper(self._one_hot_encoder).encode, + validate=False) + # Call `fit` from parent class + super().fit(Y, T, X=X, W=W, inference=inference) + + # weirdness of wrap_fit. We need to store d_t_in. But because wrap_fit decorates the parent + # fit, we need to set explicitly d_t_in here after super fit is called. + self._d_t_in = d_t_in + return self + + def const_marginal_effect(self, X): + X = check_array(X) + # Override to flatten output if T is flat + effects = super().const_marginal_effect(X=X) + return effects.reshape((-1,) + self._d_y + self._d_t) + const_marginal_effect.__doc__ = BaseOrthoForest.const_marginal_effect.__doc__ + + @staticmethod + def nuisance_estimator_generator(propensity_model, model_Y, random_state=None, second_stage=False): + """Generate nuissance estimator given model inputs from the class.""" + def nuisance_estimator(Y, T, X, W, sample_weight=None, split_indices=None): + # Expand one-hot encoding to include the zero treatment + ohe_T = np.hstack([np.all(1 - T, axis=1, keepdims=True), T]) + # Test that T contains all treatments. 
If not, return None + T = ohe_T @ np.arange(ohe_T.shape[1]) + if len(np.unique(T)) < ohe_T.shape[1]: + return None + # Nuissance estimates evaluated with cross-fitting + this_random_state = check_random_state(random_state) + if (split_indices is None) and second_stage: + # Define 2-fold iterator + kfold_it = StratifiedKFold(n_splits=2, shuffle=True, random_state=this_random_state).split(X, T) + # Check if there is only one example of some class + with warnings.catch_warnings(): + warnings.filterwarnings('error') + try: + split_indices = list(kfold_it)[0] + except Warning as warn: + msg = str(warn) + if "The least populated class in y has only 1 members" in msg: + return None + if W is not None: + X_tilde = np.concatenate((X, W), axis=1) + else: + X_tilde = X + try: + if not second_stage: + # No need to crossfit for internal nodes + propensity_model_clone = clone(propensity_model, safe=False) + propensity_model_clone.fit(X_tilde, T) + propensities = propensity_model_clone.predict_proba(X_tilde) + Y_hat = _group_predict(X_tilde, ohe_T.shape[1], + clone(model_Y, safe=False).fit(np.hstack([X_tilde, ohe_T]), Y).predict) + else: + propensities = _cross_fit(propensity_model, X_tilde, T, split_indices, + sample_weight=sample_weight, predict_func_name='predict_proba') + Y_hat = _group_cross_fit(model_Y, X_tilde, Y, ohe_T, split_indices, sample_weight=sample_weight) + except ValueError as exc: + raise ValueError("The original error: {0}".format(str(exc)) + + " This might be caused by too few sample in the tree leafs." + + " Try increasing the min_leaf_size.") + return Y_hat, propensities + return nuisance_estimator + + @staticmethod + def parameter_estimator_func(Y, T, X, + nuisance_estimates, + sample_weight=None): + """Calculate the parameter of interest for points given by (Y, T) and corresponding nuisance estimates.""" + # Compute partial moments + pointwise_params = DROrthoForest._partial_moments(Y, T, nuisance_estimates) + param_estimate = np.average(pointwise_params, weights=sample_weight, axis=0) + # If any of the values in the parameter estimate is nan, return None + return param_estimate + + @staticmethod + def second_stage_parameter_estimator_gen(lambda_reg): + """ + For the second stage parameter estimation we add a local linear correction. So + we fit a local linear function as opposed to a local constant function. We also penalize + the linear part to reduce variance. + """ + def parameter_estimator_func(Y, T, X, + nuisance_estimates, + sample_weight, + X_single): + """Calculate the parameter of interest for points given by (Y, T) and corresponding nuisance estimates. + + The parameter is calculated around the feature vector given by `X_single`. `X_single` can be used to do + local corrections on a preliminary parameter estimate. 
+ """ + # Compute partial moments + pointwise_params = DROrthoForest._partial_moments(Y, T, nuisance_estimates) + X_aug = np.hstack([np.ones((X.shape[0], 1)), X]) + # Compute coefficient by OLS on residuals + if sample_weight is not None: + weighted_X_aug = sample_weight.reshape(-1, 1) * X_aug + else: + weighted_X_aug = X_aug / X_aug.shape[0] + # ell_2 regularization + diagonal = np.ones(X_aug.shape[1]) + diagonal[0] = 0 + reg = lambda_reg * np.diag(diagonal) + # Ridge regression estimate + linear_coef_estimate = np.linalg.lstsq(np.matmul(weighted_X_aug.T, X_aug) + reg, + np.matmul(weighted_X_aug.T, pointwise_params), + rcond=None)[0].flatten() + X_aug = np.append([1], X_single) + linear_coef_estimate = linear_coef_estimate.reshape((X_aug.shape[0], -1)).T + # Parameter returned is of shape (d_T, ) + return np.dot(linear_coef_estimate, X_aug) + + return parameter_estimator_func + + @staticmethod + def moment_and_mean_gradient_estimator_func(Y, T, X, W, + nuisance_estimates, + parameter_estimate): + """Calculate the moments and mean gradient at points given by (Y, T, X, W).""" + # Return moments and gradients + # Compute partial moments + partial_moments = DROrthoForest._partial_moments(Y, T, nuisance_estimates) + # Compute moments + # Moments shape is (n, d_T-1) + moments = partial_moments - parameter_estimate + # Compute moment gradients + n_T = nuisance_estimates[0].shape[1] - 1 + mean_gradient = np.diag(np.ones(n_T) * (-1)) + return moments, mean_gradient + + @staticmethod + def _partial_moments(Y, T, nuisance_estimates): + Y_hat, propensities = nuisance_estimates + partial_moments = np.zeros((len(Y), Y_hat.shape[1] - 1)) + T = T @ np.arange(1, T.shape[1] + 1) + mask_0 = (T == 0) + for i in range(0, Y_hat.shape[1] - 1): + # Need to calculate this in an elegant way for when propensity is 0 + partial_moments[:, i] = Y_hat[:, i + 1] - Y_hat[:, 0] + mask_i = (T == (i + 1)) + partial_moments[:, i][mask_i] += (Y - Y_hat[:, i + 1])[mask_i] / propensities[:, i + 1][mask_i] + partial_moments[:, i][mask_0] -= (Y - Y_hat[:, 0])[mask_0] / propensities[:, 0][mask_0] + return partial_moments + + def _check_treatment(self, T): + try: + # This will flatten T + T = column_or_1d(T) + except Exception as exc: + raise ValueError("Expected array of shape ({n}, ), but got {T_shape}".format(n=len(T), T_shape=T.shape)) + # Check that T is numeric + try: + T.astype(float) + except Exception as exc: + raise ValueError("Expected numeric array but got non-numeric types.") + return T + + +class BLBInference(Inference): + """ + Bootstrap-of-Little-Bags inference implementation for the OrthoForest classes. + + This class can only be used for inference with any estimator derived from :class:`BaseOrthoForest`. + + Parameters + ---------- + estimator : :class:`BaseOrthoForest` + Estimator to perform inference on. Must be a child class of :class:`BaseOrthoForest`. + """ + + def fit(self, estimator, *args, **kwargs): + """ + Fits the inference model. + + This is called after the estimator's fit. + """ + self._estimator = estimator + self._input_names = estimator._input_names + # Test whether the input estimator is supported + if not hasattr(self._estimator, "_predict"): + raise TypeError("Unsupported estimator of type {}.".format(self._estimator.__class__.__name__) + + " Estimators must implement the '_predict' method with the correct signature.") + return self + + def const_marginal_effect_interval(self, X=None, *, alpha=0.1): + """ Confidence intervals for the quantities :math:`\\theta(X)` produced + by the model. 
Available only when ``inference`` is ``blb`` or ``auto``, when + calling the fit method. + + Parameters + ---------- + X: optional (m, d_x) matrix or None (Default=None) + Features for each sample + + alpha: optional float in [0, 1] (Default=0.1) + The overall level of confidence of the reported interval. + The alpha/2, 1-alpha/2 confidence interval is reported. + + Returns + ------- + lower, upper : tuple(type of :meth:`const_marginal_effect(X)` ,\ + type of :meth:`const_marginal_effect(X)` ) + The lower and the upper bounds of the confidence interval for each quantity. + """ + X = check_array(X) + params_and_cov = self._predict_wrapper(X) + # Calculate confidence intervals for the parameter (marginal effect) + lower = alpha / 2 + upper = 1 - alpha / 2 + param_lower = [param + np.apply_along_axis(lambda s: norm.ppf(lower, scale=s), 0, np.sqrt(np.diag(cov_mat))) + for (param, cov_mat) in params_and_cov] + param_upper = [param + np.apply_along_axis(lambda s: norm.ppf(upper, scale=s), 0, np.sqrt(np.diag(cov_mat))) + for (param, cov_mat) in params_and_cov] + param_lower, param_upper = np.asarray(param_lower), np.asarray(param_upper) + return param_lower.reshape((-1,) + self._estimator._d_y + self._estimator._d_t),\ + param_upper.reshape((-1,) + self._estimator._d_y + self._estimator._d_t) + + def const_marginal_effect_inference(self, X=None): + """ Inference results for the quantities :math:`\\theta(X)` produced + by the model. Available only when ``inference`` is ``blb`` or ``auto``, when + calling the fit method. + + Parameters + ---------- + X: optional (m, d_x) matrix or None (Default=None) + Features for each sample + + Returns + ------- + InferenceResults: instance of :class:`~econml.inference.NormalInferenceResults` + The inference results instance contains prediction and prediction standard error and + can on demand calculate confidence interval, z statistic and p value. It can also output + a dataframe summary of these inference results. + """ + X = check_array(X) + params, cov = zip(*(self._predict_wrapper(X))) + params = np.array(params).reshape((-1,) + self._estimator._d_y + self._estimator._d_t) + stderr = np.sqrt(np.diagonal(np.array(cov), axis1=1, axis2=2)) + stderr = stderr.reshape((-1,) + self._estimator._d_y + self._estimator._d_t) + return NormalInferenceResults(d_t=self._estimator._d_t[0] if self._estimator._d_t else 1, + d_y=self._estimator._d_y[0] if self._estimator._d_y else 1, + pred=params, pred_stderr=stderr, inf_type='effect', **self._input_names) + + def _effect_inference_helper(self, X, T0, T1): + X, T0, T1 = self._estimator._expand_treatments(*check_input_arrays(X, T0, T1)) + dT = (T1 - T0) if T0.ndim == 2 else (T1 - T0).reshape(-1, 1) + params_and_cov = self._predict_wrapper(X) + # Calculate confidence intervals for the effect + # Calculate the effects + eff = np.asarray([np.dot(params_and_cov[i][0], dT[i]) for i in range(X.shape[0])]) + # Calculate the standard deviations for the effects + scales = np.asarray([np.sqrt(dT[i] @ params_and_cov[i][1] @ dT[i]) for i in range(X.shape[0])]) + return eff.reshape((-1,) + self._estimator._d_y), scales.reshape((-1,) + self._estimator._d_y) + + def effect_interval(self, X=None, *, T0=0, T1=1, alpha=0.1): + """ Confidence intervals for the quantities :math:`\\tau(X, T0, T1)` produced + by the model. Available only when ``inference`` is ``blb`` or ``auto``, when + calling the fit method. 
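For orientation, a minimal usage sketch of the BLB-based intervals exposed by these methods, with synthetic data and a deliberately small forest so it runs quickly (the data-generating process and parameter values are made up):

```Python
import numpy as np
from econml.orf import DMLOrthoForest

rng = np.random.default_rng(123)
n = 500
X = rng.uniform(-1, 1, size=(n, 2))
W = rng.normal(size=(n, 3))
T = 0.5 * X[:, 0] + rng.normal(size=n)
Y = (1 + X[:, 0]) * T + W[:, 0] + rng.normal(size=n)

est = DMLOrthoForest(n_trees=100, min_leaf_size=10)
est.fit(Y, T, X=X, W=W, inference='blb')           # 'blb' (or 'auto') enables the intervals below
X_test = rng.uniform(-1, 1, size=(10, 2))
lb, ub = est.effect_interval(X_test, alpha=0.05)   # pointwise confidence intervals for the CATE
res = est.const_marginal_effect_inference(X_test)  # NormalInferenceResults with estimates and stderrs
```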
+ + Parameters + ---------- + X: optional (m, d_x) matrix + Features for each sample + T0: optional (m, d_t) matrix or vector of length m (Default=0) + Base treatments for each sample + T1: optional (m, d_t) matrix or vector of length m (Default=1) + Target treatments for each sample + alpha: optional float in [0, 1] (Default=0.1) + The overall level of confidence of the reported interval. + The alpha/2, 1-alpha/2 confidence interval is reported. + + Returns + ------- + lower, upper : tuple(type of :meth:`effect(X, T0, T1)`, type of :meth:`effect(X, T0, T1))` ) + The lower and the upper bounds of the confidence interval for each quantity. + """ + eff, scales = self._effect_inference_helper(X, T0, T1) + lower = alpha / 2 + upper = 1 - alpha / 2 + effect_lower = eff + np.apply_along_axis(lambda s: norm.ppf(lower, scale=s), 0, scales) + effect_upper = eff + np.apply_along_axis(lambda s: norm.ppf(upper, scale=s), 0, scales) + return effect_lower, effect_upper + + def effect_inference(self, X=None, *, T0=0, T1=1): + """ Inference results for the quantities :math:`\\tau(X, T0, T1)` produced + by the model. Available only when ``inference`` is ``blb`` or ``auto``, when + calling the fit method. + + Parameters + ---------- + X: optional (m, d_x) matrix + Features for each sample + T0: optional (m, d_t) matrix or vector of length m (Default=0) + Base treatments for each sample + T1: optional (m, d_t) matrix or vector of length m (Default=1) + Target treatments for each sample + + Returns + ------- + InferenceResults: instance of :class:`~econml.inference.NormalInferenceResults` + The inference results instance contains prediction and prediction standard error and + can on demand calculate confidence interval, z statistic and p value. It can also output + a dataframe summary of these inference results. + """ + eff, scales = self._effect_inference_helper(X, T0, T1) + return NormalInferenceResults(d_t=1, d_y=self._estimator._d_y[0] if self._estimator._d_y else 1, + pred=eff, pred_stderr=scales, inf_type='effect', **self._input_names) + + def _predict_wrapper(self, X=None): + return self._estimator._predict(X, stderr=True) + + +@deprecated("The ContinuousTreatmentOrthoForest class has been renamed to DMLOrthoForest; " + "an upcoming release will remove support for the old name") +class ContinuousTreatmentOrthoForest(DMLOrthoForest): + pass + + +@deprecated("The DiscreteTreatmentOrthoForest class has been renamed to DROrthoForest; " + "an upcoming release will remove support for the old name") +class DiscreteTreatmentOrthoForest(DROrthoForest): + pass diff --git a/econml/ortho_forest.py b/econml/ortho_forest.py index c94be2a6..a64c735e 100644 --- a/econml/ortho_forest.py +++ b/econml/ortho_forest.py @@ -1,1316 +1,32 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - -"""Orthogonal Random Forest. - -Orthogonal Random Forest (ORF) is an algorithm for heterogenous treatment effect -estimation. Orthogonal Random Forest combines orthogonalization, -a technique that effectively removes the confounding effect in two-stage estimation, -with generalized random forests, a flexible method for estimating treatment -effect heterogeneity. - -This file consists of classes that implement the following variants of the ORF method: - -- The :class:`DMLOrthoForest`, a two-forest approach for learning continuous or discrete treatment effects - using kernel two stage estimation. 
- -- The :class:`DROrthoForest`, a two-forest approach for learning discrete treatment effects - using kernel two stage estimation. - -For more details on these methods, see our paper [Oprescu2019]_. -""" - -import abc -import inspect -import numpy as np -import warnings -from joblib import Parallel, delayed -from sklearn import clone -from scipy.stats import norm -from sklearn.exceptions import NotFittedError -from sklearn.linear_model import LassoCV, Lasso, LinearRegression, LogisticRegression, \ - LogisticRegressionCV, ElasticNet -from sklearn.model_selection import KFold, StratifiedKFold -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import OneHotEncoder, LabelEncoder, PolynomialFeatures, FunctionTransformer -from sklearn.utils import check_random_state, check_array, column_or_1d -from .sklearn_extensions.linear_model import WeightedLassoCVWrapper -from ._cate_estimator import BaseCateEstimator, LinearCateEstimator, TreatmentExpansionMixin -from ._causal_tree import CausalTree -from .inference import Inference, NormalInferenceResults -from .utilities import (reshape, reshape_Y_T, MAX_RAND_SEED, check_inputs, _deprecate_positional, - cross_product, inverse_onehot, _EncoderWrapper, check_input_arrays, - _RegressionWrapper, deprecated) -from sklearn.model_selection import check_cv -# TODO: consider working around relying on sklearn implementation details -from .sklearn_extensions.model_selection import _cross_val_predict - - -def _build_tree_in_parallel(tree, Y, T, X, W, - nuisance_estimator, parameter_estimator, moment_and_mean_gradient_estimator): - # Create splits of causal tree - tree.create_splits(Y, T, X, W, nuisance_estimator, parameter_estimator, moment_and_mean_gradient_estimator) - return tree - - -def _fit_weighted_pipeline(model_instance, X, y, sample_weight): - weights_error_msg = ( - "Estimators of type {} do not accept weights. " - "Consider using the class WeightedModelWrapper from econml.utilities to build a weighted model." 
- ) - expected_error_msg = "fit() got an unexpected keyword argument 'sample_weight'" - if not isinstance(model_instance, Pipeline): - try: - model_instance.fit(X, y, sample_weight=sample_weight) - except TypeError as e: - if expected_error_msg in str(e): - # Make sure the correct exception is being rethrown - raise TypeError(weights_error_msg.format(model_instance.__class__.__name__)) - else: - raise e - else: - try: - last_step_name = model_instance.steps[-1][0] - model_instance.fit(X, y, **{"{0}__sample_weight".format(last_step_name): sample_weight}) - except TypeError as e: - if expected_error_msg in str(e): - raise TypeError(weights_error_msg.format(model_instance.steps[-1][1].__class__.__name__)) - else: - raise e - - -def _cross_fit(model_instance, X, y, split_indices, sample_weight=None, predict_func_name='predict'): - model_instance1 = clone(model_instance, safe=False) - model_instance2 = clone(model_instance, safe=False) - split_1, split_2 = split_indices - predict_func1 = getattr(model_instance1, predict_func_name) - predict_func2 = getattr(model_instance2, predict_func_name) - if sample_weight is None: - model_instance2.fit(X[split_2], y[split_2]) - pred_1 = predict_func2(X[split_1]) - model_instance1.fit(X[split_1], y[split_1]) - pred_2 = predict_func1(X[split_2]) - else: - _fit_weighted_pipeline(model_instance2, X[split_2], y[split_2], sample_weight[split_2]) - pred_1 = predict_func2(X[split_1]) - _fit_weighted_pipeline(model_instance1, X[split_1], y[split_1], sample_weight[split_1]) - pred_2 = predict_func1(X[split_2]) - # Must make sure indices are merged correctly - sorted_split_indices = np.argsort(np.concatenate(split_indices), kind='mergesort') - return np.concatenate((pred_1, pred_2))[sorted_split_indices] - - -def _group_predict(X, n_groups, predict_func): - """ Helper function that predicts using the predict function - for every input argument that looks like [X; i] for i in range(n_groups). Used in - DR moments, where we want to predict for each [X; t], for any value of the treatment t. - Returns an (X.shape[0], n_groups) matrix of predictions for each row of X and each t in range(n_groups). 
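The `_cross_fit` helper above implements the usual two-fold "swap and predict" pattern: each half of the data is scored by a model trained on the other half, and the out-of-fold predictions are reassembled in the original row order. A self-contained sketch of that pattern, with toy data and a plain `LinearRegression` standing in for the nuisance model:

```Python
import numpy as np
from sklearn.base import clone
from sklearn.linear_model import LinearRegression

def cross_fit_sketch(model, X, y, split_indices):
    s1, s2 = split_indices
    m1, m2 = clone(model), clone(model)
    pred1 = m2.fit(X[s2], y[s2]).predict(X[s1])  # fold 1 scored by the model trained on fold 2
    pred2 = m1.fit(X[s1], y[s1]).predict(X[s2])  # fold 2 scored by the model trained on fold 1
    order = np.argsort(np.concatenate(split_indices), kind='mergesort')
    return np.concatenate([pred1, pred2])[order]  # back to the original row order

X = np.random.normal(size=(10, 2))
y = X @ np.array([1.0, -2.0])
print(cross_fit_sketch(LinearRegression(), X, y, (np.arange(5), np.arange(5, 10))))
```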
- - Parameters - ---------- - X : (n, m) array - n_groups : int - predict_func : fn - - Returns - ------- - pred : (n, n_groups) array - """ - group_pred = np.zeros((X.shape[0], n_groups)) - zero_t = np.zeros((X.shape[0], n_groups)) - for i in range(n_groups): - zero_t[:, i] = 1 - group_pred[:, i] = predict_func(np.concatenate((X, zero_t), axis=1)) - zero_t[:, i] = 0 - # Convert rows to columns - return group_pred - - -def _group_cross_fit(model_instance, X, y, t, split_indices, sample_weight=None, predict_func_name='predict'): - # Require group assignment t to be one-hot-encoded - model_instance1 = clone(model_instance, safe=False) - model_instance2 = clone(model_instance, safe=False) - split_1, split_2 = split_indices - n_groups = t.shape[1] - predict_func1 = getattr(model_instance1, predict_func_name) - predict_func2 = getattr(model_instance2, predict_func_name) - Xt = np.concatenate((X, t), axis=1) - # Get predictions for the 2 splits - if sample_weight is None: - model_instance2.fit(Xt[split_2], y[split_2]) - pred_1 = _group_predict(X[split_1], n_groups, predict_func2) - model_instance1.fit(Xt[split_1], y[split_1]) - pred_2 = _group_predict(X[split_2], n_groups, predict_func1) - else: - _fit_weighted_pipeline(model_instance2, Xt[split_2], y[split_2], sample_weight[split_2]) - pred_1 = _group_predict(X[split_1], n_groups, predict_func2) - _fit_weighted_pipeline(model_instance1, Xt[split_1], y[split_1], sample_weight[split_1]) - pred_2 = _group_predict(X[split_2], n_groups, predict_func1) - # Must make sure indices are merged correctly - sorted_split_indices = np.argsort(np.concatenate(split_indices), kind='mergesort') - return np.concatenate((pred_1, pred_2))[sorted_split_indices] - - -def _pointwise_effect(X_single, Y, T, X, W, w_nonzero, split_inds, slice_weights_list, - second_stage_nuisance_estimator, second_stage_parameter_estimator, - moment_and_mean_gradient_estimator, slice_len, n_slices, n_trees, - stderr=False): - """Calculate the effect for a one data point with features X_single. - - Parameters - ---------- - X_single : array-like, shape (d_x, ) - Feature vector that captures heterogeneity for one sample. - - stderr : boolean (default=False) - Whether to calculate the covariance matrix via bootstrap-of-little-bags. - """ - # Crossfitting - # Compute weighted nuisance estimates - nuisance_estimates = second_stage_nuisance_estimator(Y, T, X, W, w_nonzero, split_indices=split_inds) - parameter_estimate = second_stage_parameter_estimator(Y, T, X, nuisance_estimates, w_nonzero, X_single) - # ------------------------------------------------------------------------------- - # Calculate the covariance matrix corresponding to the BLB inference - # - # 1. Calculate the moments and gradient of the training data w.r.t the test point - # 2. Calculate the weighted moments for each tree slice to create a matrix - # U = (n_slices, n_T). The V = (U x grad^{-1}) matrix represents the deviation - # in that slice from the overall parameter estimate. - # 3. 
Calculate the covariance matrix (V.T x V) / n_slices - # ------------------------------------------------------------------------------- - if stderr: - moments, mean_grad = moment_and_mean_gradient_estimator(Y, T, X, W, nuisance_estimates, - parameter_estimate) - # Calclulate covariance matrix through BLB - slice_weighted_moment_one = [] - slice_weighted_moment_two = [] - for slice_weights_one, slice_weights_two in slice_weights_list: - slice_weighted_moment_one.append( - np.average(moments[:len(split_inds[0])], axis=0, weights=slice_weights_one) - ) - slice_weighted_moment_two.append( - np.average(moments[len(split_inds[0]):], axis=0, weights=slice_weights_two) - ) - U = np.vstack(slice_weighted_moment_one + slice_weighted_moment_two) - inverse_grad = np.linalg.inv(mean_grad) - cov_mat = inverse_grad.T @ U.T @ U @ inverse_grad / (2 * n_slices) - return parameter_estimate, cov_mat - return parameter_estimate - - -class BaseOrthoForest(TreatmentExpansionMixin, LinearCateEstimator): - """Base class for the :class:`DMLOrthoForest` and :class:`DROrthoForest`.""" - - def __init__(self, - nuisance_estimator, - second_stage_nuisance_estimator, - parameter_estimator, - second_stage_parameter_estimator, - moment_and_mean_gradient_estimator, - discrete_treatment=False, - categories='auto', - n_trees=500, - min_leaf_size=10, max_depth=10, - subsample_ratio=0.25, - bootstrap=False, - n_jobs=-1, - backend='loky', - verbose=3, - batch_size='auto', - random_state=None): - # Estimators - self.nuisance_estimator = nuisance_estimator - self.second_stage_nuisance_estimator = second_stage_nuisance_estimator - self.parameter_estimator = parameter_estimator - self.second_stage_parameter_estimator = second_stage_parameter_estimator - self.moment_and_mean_gradient_estimator = moment_and_mean_gradient_estimator - # OrthoForest parameters - self.n_trees = n_trees - self.min_leaf_size = min_leaf_size - self.max_depth = max_depth - self.bootstrap = bootstrap - self.subsample_ratio = subsample_ratio - self.n_jobs = n_jobs - self.random_state = check_random_state(random_state) - # Sub-forests - self.forest_one_trees = None - self.forest_two_trees = None - self.forest_one_subsample_ind = None - self.forest_two_subsample_ind = None - # Auxiliary attributes - self.n_slices = int(np.ceil((self.n_trees)**(1 / 2))) - self.slice_len = int(np.ceil(self.n_trees / self.n_slices)) - # Fit check - self.model_is_fitted = False - self.discrete_treatment = discrete_treatment - self.backend = backend - self.verbose = verbose - self.batch_size = batch_size - super().__init__() - - @_deprecate_positional("X and W should be passed by keyword only. In a future release " - "we will disallow passing X and W by position.", ['X', 'W']) - @BaseCateEstimator._wrap_fit - def fit(self, Y, T, X, W=None, *, inference='auto'): - """Build an orthogonal random forest from a training set (Y, T, X, W). - - Parameters - ---------- - Y : array-like, shape (n, ) - Outcome for the treatment policy. - - T : array-like, shape (n, d_t) - Treatment policy. - - X : array-like, shape (n, d_x) - Feature vector that captures heterogeneity. - - W : array-like, shape (n, d_w) or None (default=None) - High-dimensional controls. - - inference: string, :class:`.Inference` instance, or None - Method for performing inference. This estimator supports 'bootstrap' - (or an instance of :class:`.BootstrapInference`) and 'blb' (or an instance of :class:`BLBInference`) - - Returns - ------- - self: an instance of self. 
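The three-step bootstrap-of-little-bags covariance described in the comment above reduces to a few matrix operations; a toy illustration with made-up shapes and values:

```Python
import numpy as np

n_slices, d_t = 5, 2
rng = np.random.default_rng(0)
U = rng.normal(size=(2 * n_slices, d_t))  # per-slice weighted moments, one row per (slice, half-sample)
mean_grad = -np.eye(d_t)                  # mean moment gradient; the DR moments above use -identity
inv_grad = np.linalg.inv(mean_grad)
cov = inv_grad.T @ U.T @ U @ inv_grad / (2 * n_slices)
stderr = np.sqrt(np.diag(cov))            # feeds the normal-approximation intervals in BLBInference
```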
- """ - Y, T, X, W = check_inputs(Y, T, X, W, multi_output_Y=False) - shuffled_inidces = self.random_state.permutation(X.shape[0]) - n = X.shape[0] // 2 - self.Y_one = Y[shuffled_inidces[:n]] - self.Y_two = Y[shuffled_inidces[n:]] - self.T_one = T[shuffled_inidces[:n]] - self.T_two = T[shuffled_inidces[n:]] - self.X_one = X[shuffled_inidces[:n]] - self.X_two = X[shuffled_inidces[n:]] - if W is not None: - self.W_one = W[shuffled_inidces[:n]] - self.W_two = W[shuffled_inidces[n:]] - else: - self.W_one = None - self.W_two = None - self.forest_one_subsample_ind, self.forest_one_trees = self._fit_forest(Y=self.Y_one, - T=self.T_one, - X=self.X_one, - W=self.W_one) - self.forest_two_subsample_ind, self.forest_two_trees = self._fit_forest(Y=self.Y_two, - T=self.T_two, - X=self.X_two, - W=self.W_two) - self.model_is_fitted = True - return self - - def const_marginal_effect(self, X): - """Calculate the constant marginal CATE θ(·) conditional on a vector of features X. - - Parameters - ---------- - X : array-like, shape (n, d_x) - Feature vector that captures heterogeneity. - - Returns - ------- - Theta : matrix , shape (n, d_t) - Constant marginal CATE of each treatment for each sample. - """ - # TODO: Check performance - return np.asarray(self._predict(X)) - - def _predict(self, X, stderr=False): - if not self.model_is_fitted: - raise NotFittedError('This {0} instance is not fitted yet.'.format(self.__class__.__name__)) - X = check_array(X) - results = Parallel(n_jobs=self.n_jobs, backend=self.backend, - batch_size=self.batch_size, verbose=self.verbose)( - delayed(_pointwise_effect)(X_single, *self._pw_effect_inputs(X_single, stderr=stderr), - self.second_stage_nuisance_estimator, self.second_stage_parameter_estimator, - self.moment_and_mean_gradient_estimator, self.slice_len, self.n_slices, - self.n_trees, - stderr=stderr) for X_single in X) - return results - - def _pw_effect_inputs(self, X_single, stderr=False): - w1, w2 = self._get_weights(X_single) - mask_w1 = (w1 != 0) - mask_w2 = (w2 != 0) - w1_nonzero = w1[mask_w1] - w2_nonzero = w2[mask_w2] - # Must normalize weights - w_nonzero = np.concatenate((w1_nonzero, w2_nonzero)) - split_inds = (np.arange(len(w1_nonzero)), np.arange(len(w1_nonzero), len(w_nonzero))) - slice_weights_list = [] - if stderr: - slices = [ - (it * self.slice_len, min((it + 1) * self.slice_len, self.n_trees)) for it in range(self.n_slices) - ] - for slice_it in slices: - slice_weights_one, slice_weights_two = self._get_weights(X_single, tree_slice=slice_it) - slice_weights_list.append((slice_weights_one[mask_w1], slice_weights_two[mask_w2])) - W_none = self.W_one is None - return np.concatenate((self.Y_one[mask_w1], self.Y_two[mask_w2])),\ - np.concatenate((self.T_one[mask_w1], self.T_two[mask_w2])),\ - np.concatenate((self.X_one[mask_w1], self.X_two[mask_w2])),\ - np.concatenate((self.W_one[mask_w1], self.W_two[mask_w2]) - ) if not W_none else None,\ - w_nonzero,\ - split_inds, slice_weights_list - - def _get_inference_options(self): - # Override the CATE inference options - # Add blb inference to parent's options - options = super()._get_inference_options() - options.update(blb=BLBInference) - options.update(auto=BLBInference) - return options - - def _fit_forest(self, Y, T, X, W=None): - # Generate subsample indices - subsample_ind = self._get_blb_indices(X) - # Build trees in parallel - trees = [CausalTree(self.min_leaf_size, self.max_depth, 1000, .4, - check_random_state(self.random_state.randint(MAX_RAND_SEED))) - for _ in range(len(subsample_ind))] - return 
subsample_ind, Parallel(n_jobs=self.n_jobs, backend=self.backend, - batch_size=self.batch_size, verbose=self.verbose, max_nbytes=None)( - delayed(_build_tree_in_parallel)(tree, - Y[s], T[s], X[s], W[s] if W is not None else None, - self.nuisance_estimator, - self.parameter_estimator, - self.moment_and_mean_gradient_estimator) - for s, tree in zip(subsample_ind, trees)) - - def _get_weights(self, X_single, tree_slice=None): - """Calculate weights for a single input feature vector over a subset of trees. - - The subset of trees is defined by the `tree_slice` tuple (start, end). - The (start, end) tuple includes all trees from `start` to `end`-1. - """ - w1 = np.zeros(self.Y_one.shape[0]) - w2 = np.zeros(self.Y_two.shape[0]) - if tree_slice is None: - tree_range = range(self.n_trees) - else: - tree_range = range(*tree_slice) - for t in tree_range: - leaf = self.forest_one_trees[t].find_split(X_single) - weight_indexes = self.forest_one_subsample_ind[t][leaf.est_sample_inds] - leaf_weight = 1 / len(leaf.est_sample_inds) - if self.bootstrap: - # Bootstraping has repetitions in tree sample - unique, counts = np.unique(weight_indexes, return_counts=True) - w1[unique] += leaf_weight * counts - else: - w1[weight_indexes] += leaf_weight - for t in tree_range: - leaf = self.forest_two_trees[t].find_split(X_single) - # Similar for `a` weights - weight_indexes = self.forest_two_subsample_ind[t][leaf.est_sample_inds] - leaf_weight = 1 / len(leaf.est_sample_inds) - if self.bootstrap: - # Bootstraping has repetitions in tree sample - unique, counts = np.unique(weight_indexes, return_counts=True) - w2[unique] += leaf_weight * counts - else: - w2[weight_indexes] += leaf_weight - return (w1 / len(tree_range), w2 / len(tree_range)) - - def _get_blb_indices(self, X): - """Get data indices for every tree under the little bags split.""" - # Define subsample size - subsample_size = X.shape[0] // 2 - if not self.bootstrap: - if self.subsample_ratio > 1.0: - # Safety check - warnings.warn("The argument 'subsample_ratio' must be between 0.0 and 1.0, " + - "however a value of {} was provided. The 'subsample_ratio' will be changed to 1.0.") - self.subsample_ratio = 1.0 - subsample_size = int(self.subsample_ratio * subsample_size) - subsample_ind = [] - # Draw points to create little bags - for it in range(self.n_slices): - half_sample_inds = self.random_state.choice( - X.shape[0], X.shape[0] // 2, replace=False) - for _ in np.arange(it * self.slice_len, min((it + 1) * self.slice_len, self.n_trees)): - subsample_ind.append(half_sample_inds[self.random_state.choice( - X.shape[0] // 2, subsample_size, replace=self.bootstrap)]) - return np.asarray(subsample_ind) - - -class DMLOrthoForest(BaseOrthoForest): - """OrthoForest for continuous or discrete treatments using the DML residual on residual moment function. - - A two-forest approach for learning heterogeneous treatment effects using - kernel two stage estimation. - - Parameters - ---------- - n_trees : integer, optional (default=500) - Number of causal estimators in the forest. - - min_leaf_size : integer, optional (default=10) - The minimum number of samples in a leaf. - - max_depth : integer, optional (default=10) - The maximum number of splits to be performed when expanding the tree. - - subsample_ratio : float, optional (default=0.7) - The ratio of the total sample to be used when training a causal tree. - Values greater than 1.0 will be considered equal to 1.0. - Parameter is ignored when bootstrap=True. 
- - bootstrap : boolean, optional (default=False) - Whether to use bootstrap subsampling. - - lambda_reg : float, optional (default=0.01) - The regularization coefficient in the ell_2 penalty imposed on the - locally linear part of the second stage fit. This is not applied to - the local intercept, only to the coefficient of the linear component. - - model_T : estimator, optional (default=sklearn.linear_model.LassoCV(cv=3)) - The estimator for residualizing the continuous treatment at each leaf. - Must implement `fit` and `predict` methods. - - model_Y : estimator, optional (default=sklearn.linear_model.LassoCV(cv=3) - The estimator for residualizing the outcome at each leaf. Must implement - `fit` and `predict` methods. - - model_T_final : estimator, optional (default=None) - The estimator for residualizing the treatment at prediction time. Must implement - `fit` and `predict` methods. If parameter is set to ``None``, it defaults to the - value of `model_T` parameter. - - model_Y_final : estimator, optional (default=None) - The estimator for residualizing the outcome at prediction time. Must implement - `fit` and `predict` methods. If parameter is set to ``None``, it defaults to the - value of `model_Y` parameter. - - global_residualization : bool, optional (default=False) - Whether to perform a prior residualization of Y and T using the model_Y_final and model_T_final - estimators, or whether to perform locally weighted residualization at each target point. - Global residualization is computationally less intensive, but could lose some statistical - power, especially when W is not None. - - global_res_cv : int, cross-validation generator or an iterable, optional (default=2) - The specification of the CV splitter to be used for cross-fitting, when constructing - the global residuals of Y and T. - - discrete_treatment : bool, optional (default=False) - Whether the treatment should be treated as categorical. If True, then the treatment T is - one-hot-encoded and the model_T is treated as a classifier that must have a predict_proba - method. - - categories : array like or 'auto', optional (default='auto') - A list of pre-specified treatment categories. If 'auto' then categories are automatically - recognized at fit time. - - n_jobs : int, optional (default=-1) - The number of jobs to run in parallel for both :meth:`fit` and :meth:`effect`. - ``-1`` means using all processors. Since OrthoForest methods are - computationally heavy, it is recommended to set `n_jobs` to -1. - - backend : 'threading' or 'loky', optional (default='loky') - What backend should be used for parallelization with the joblib library. - - verbose : int, optional (default=3) - Verbosity level - - batch_size : int or 'auto', optional (default='auto') - Batch_size of jobs for parallelism - - random_state : int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; - If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used - by :mod:`np.random`. 
- - """ - - def __init__(self, *, - n_trees=500, - min_leaf_size=10, max_depth=10, - subsample_ratio=0.7, - bootstrap=False, - lambda_reg=0.01, - model_T='auto', - model_Y=WeightedLassoCVWrapper(cv=3), - model_T_final=None, - model_Y_final=None, - global_residualization=False, - global_res_cv=2, - discrete_treatment=False, - categories='auto', - n_jobs=-1, - backend='loky', - verbose=3, - batch_size='auto', - random_state=None): - # Copy and/or define models - self.lambda_reg = lambda_reg - if model_T == 'auto': - if discrete_treatment: - model_T = LogisticRegressionCV(cv=3) - else: - model_T = WeightedLassoCVWrapper(cv=3) - self.model_T = model_T - self.model_Y = model_Y - self.model_T_final = model_T_final - self.model_Y_final = model_Y_final - if self.model_T_final is None: - self.model_T_final = clone(self.model_T, safe=False) - if self.model_Y_final is None: - self.model_Y_final = clone(self.model_Y, safe=False) - if discrete_treatment: - self.model_T = _RegressionWrapper(self.model_T) - self.model_T_final = _RegressionWrapper(self.model_T_final) - self.random_state = check_random_state(random_state) - self.global_residualization = global_residualization - self.global_res_cv = global_res_cv - # Define nuisance estimators - nuisance_estimator = _DMLOrthoForest_nuisance_estimator_generator( - self.model_T, self.model_Y, self.random_state, second_stage=False, - global_residualization=self.global_residualization, discrete_treatment=discrete_treatment) - second_stage_nuisance_estimator = _DMLOrthoForest_nuisance_estimator_generator( - self.model_T_final, self.model_Y_final, self.random_state, second_stage=True, - global_residualization=self.global_residualization, discrete_treatment=discrete_treatment) - # Define parameter estimators - parameter_estimator = _DMLOrthoForest_parameter_estimator_func - second_stage_parameter_estimator = _DMLOrthoForest_second_stage_parameter_estimator_gen( - self.lambda_reg) - # Define - moment_and_mean_gradient_estimator = _DMLOrthoForest_moment_and_mean_gradient_estimator_func - if discrete_treatment: - if categories != 'auto': - categories = [categories] # OneHotEncoder expects a 2D array with features per column - self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first') - super().__init__( - nuisance_estimator, - second_stage_nuisance_estimator, - parameter_estimator, - second_stage_parameter_estimator, - moment_and_mean_gradient_estimator, - n_trees=n_trees, - min_leaf_size=min_leaf_size, - max_depth=max_depth, - subsample_ratio=subsample_ratio, - bootstrap=bootstrap, - n_jobs=n_jobs, - backend=backend, - verbose=verbose, - batch_size=batch_size, - discrete_treatment=discrete_treatment, - categories=categories, - random_state=self.random_state) - - def _combine(self, X, W): - if X is None: - return W - if W is None: - return X - return np.hstack([X, W]) - - # Need to redefine fit here for auto inference to work due to a quirk in how - # wrap_fit is defined - @_deprecate_positional("X and W should be passed by keyword only. In a future release " - "we will disallow passing X and W by position.", ['X', 'W']) - def fit(self, Y, T, X, W=None, *, inference='auto'): - """Build an orthogonal random forest from a training set (Y, T, X, W). - - Parameters - ---------- - Y : array-like, shape (n, ) - Outcome for the treatment policy. - - T : array-like, shape (n, d_t) - Treatment policy. - - X : array-like, shape (n, d_x) - Feature vector that captures heterogeneity. 
- - W : array-like, shape (n, d_w) or None (default=None) - High-dimensional controls. - - inference: string, :class:`.Inference` instance, or None - Method for performing inference. This estimator supports 'bootstrap' - (or an instance of :class:`.BootstrapInference`) and 'blb' (or an instance of :class:`BLBInference`) - - Returns - ------- - self: an instance of self. - """ - self._set_input_names(Y, T, X, set_flag=True) - Y, T, X, W = check_inputs(Y, T, X, W) - if self.discrete_treatment: - d_t_in = T.shape[1:] - T = self._one_hot_encoder.fit_transform(T.reshape(-1, 1)) - self._d_t = T.shape[1:] - self.transformer = FunctionTransformer( - func=_EncoderWrapper(self._one_hot_encoder).encode, - validate=False) - - if self.global_residualization: - cv = check_cv(self.global_res_cv, y=T, classifier=self.discrete_treatment) - cv = list(cv.split(X=X, y=T)) - Y = Y - _cross_val_predict(self.model_Y_final, self._combine(X, W), Y, cv=cv, safe=False).reshape(Y.shape) - T = T - _cross_val_predict(self.model_T_final, self._combine(X, W), T, cv=cv, safe=False).reshape(T.shape) - - super().fit(Y, T, X=X, W=W, inference=inference) - - # weirdness of wrap_fit. We need to store d_t_in. But because wrap_fit decorates the parent - # fit, we need to set explicitly d_t_in here after super fit is called. - if self.discrete_treatment: - self._d_t_in = d_t_in - return self - - def const_marginal_effect(self, X): - X = check_array(X) - # Override to flatten output if T is flat - effects = super().const_marginal_effect(X=X) - return effects.reshape((-1,) + self._d_y + self._d_t) - const_marginal_effect.__doc__ = BaseOrthoForest.const_marginal_effect.__doc__ - - -class _DMLOrthoForest_nuisance_estimator_generator: - """Generate nuissance estimator given model inputs from the class.""" - - def __init__(self, model_T, model_Y, random_state=None, second_stage=True, - global_residualization=False, discrete_treatment=False): - self.model_T = model_T - self.model_Y = model_Y - self.random_state = random_state - self.second_stage = second_stage - self.global_residualization = global_residualization - self.discrete_treatment = discrete_treatment - - def __call__(self, Y, T, X, W, sample_weight=None, split_indices=None): - if self.global_residualization: - return 0 - if self.discrete_treatment: - # Check that all discrete treatments are represented - if len(np.unique(T @ np.arange(1, T.shape[1] + 1))) < T.shape[1] + 1: - return None - # Nuissance estimates evaluated with cross-fitting - this_random_state = check_random_state(self.random_state) - if (split_indices is None) and self.second_stage: - if self.discrete_treatment: - # Define 2-fold iterator - kfold_it = StratifiedKFold(n_splits=2, shuffle=True, random_state=this_random_state).split(X, T) - # Check if there is only one example of some class - with warnings.catch_warnings(): - warnings.filterwarnings('error') - try: - split_indices = list(kfold_it)[0] - except Warning as warn: - msg = str(warn) - if "The least populated class in y has only 1 members" in msg: - return None - else: - # Define 2-fold iterator - kfold_it = KFold(n_splits=2, shuffle=True, random_state=this_random_state).split(X) - split_indices = list(kfold_it)[0] - if W is not None: - X_tilde = np.concatenate((X, W), axis=1) - else: - X_tilde = X - - try: - if self.second_stage: - T_hat = _cross_fit(self.model_T, X_tilde, T, split_indices, sample_weight=sample_weight) - Y_hat = _cross_fit(self.model_Y, X_tilde, Y, split_indices, sample_weight=sample_weight) - else: - # need safe=False when cloning for 
WeightedModelWrapper - T_hat = clone(self.model_T, safe=False).fit(X_tilde, T).predict(X_tilde) - Y_hat = clone(self.model_Y, safe=False).fit(X_tilde, Y).predict(X_tilde) - except ValueError as exc: - raise ValueError("The original error: {0}".format(str(exc)) + - " This might be caused by too few sample in the tree leafs." + - " Try increasing the min_leaf_size.") - return Y_hat, T_hat - - -def _DMLOrthoForest_parameter_estimator_func(Y, T, X, - nuisance_estimates, - sample_weight=None): - """Calculate the parameter of interest for points given by (Y, T) and corresponding nuisance estimates.""" - # Compute residuals - Y_res, T_res = _DMLOrthoForest_get_conforming_residuals(Y, T, nuisance_estimates) - # Compute coefficient by OLS on residuals - param_estimate = LinearRegression(fit_intercept=False).fit( - T_res, Y_res, sample_weight=sample_weight - ).coef_ - # Parameter returned by LinearRegression is (d_T, ) - return param_estimate - - -class _DMLOrthoForest_second_stage_parameter_estimator_gen: - """ - For the second stage parameter estimation we add a local linear correction. So - we fit a local linear function as opposed to a local constant function. We also penalize - the linear part to reduce variance. - """ - - def __init__(self, lambda_reg): - self.lambda_reg = lambda_reg - - def __call__(self, Y, T, X, - nuisance_estimates, - sample_weight, - X_single): - """Calculate the parameter of interest for points given by (Y, T) and corresponding nuisance estimates. - - The parameter is calculated around the feature vector given by `X_single`. `X_single` can be used to do - local corrections on a preliminary parameter estimate. - """ - # Compute residuals - Y_res, T_res = _DMLOrthoForest_get_conforming_residuals(Y, T, nuisance_estimates) - X_aug = np.hstack([np.ones((X.shape[0], 1)), X]) - XT_res = cross_product(T_res, X_aug) - # Compute coefficient by OLS on residuals - if sample_weight is not None: - weighted_XT_res = sample_weight.reshape(-1, 1) * XT_res - else: - weighted_XT_res = XT_res / XT_res.shape[0] - # ell_2 regularization - diagonal = np.ones(XT_res.shape[1]) - diagonal[:T_res.shape[1]] = 0 - reg = self.lambda_reg * np.diag(diagonal) - # Ridge regression estimate - linear_coef_estimate = np.linalg.lstsq(np.matmul(weighted_XT_res.T, XT_res) + reg, - np.matmul(weighted_XT_res.T, Y_res.reshape(-1, 1)), - rcond=None)[0].flatten() - X_aug = np.append([1], X_single) - linear_coef_estimate = linear_coef_estimate.reshape((X_aug.shape[0], -1)).T - # Parameter returned is of shape (d_T, ) - return np.dot(linear_coef_estimate, X_aug) - - -def _DMLOrthoForest_moment_and_mean_gradient_estimator_func(Y, T, X, W, - nuisance_estimates, - parameter_estimate): - """Calculate the moments and mean gradient at points given by (Y, T, X, W).""" - # Return moments and gradients - # Compute residuals - Y_res, T_res = _DMLOrthoForest_get_conforming_residuals(Y, T, nuisance_estimates) - # Compute moments - # Moments shape is (n, d_T) - moments = (Y_res - np.matmul(T_res, parameter_estimate)).reshape(-1, 1) * T_res - # Compute moment gradients - mean_gradient = - np.matmul(T_res.T, T_res) / T_res.shape[0] - return moments, mean_gradient - - -def _DMLOrthoForest_get_conforming_residuals(Y, T, nuisance_estimates): - if nuisance_estimates == 0: - return reshape_Y_T(Y, T) - # returns shape-conforming residuals - Y_hat, T_hat = reshape_Y_T(*nuisance_estimates) - Y, T = reshape_Y_T(Y, T) - Y_res, T_res = Y - Y_hat, T - T_hat - return Y_res, T_res - - -class DROrthoForest(BaseOrthoForest): - """ - OrthoForest 
for discrete treatments using the doubly robust moment function. - - A two-forest approach for learning heterogeneous treatment effects using - kernel two stage estimation. - - Parameters - ---------- - n_trees : integer, optional (default=500) - Number of causal estimators in the forest. - - min_leaf_size : integer, optional (default=10) - The minimum number of samples in a leaf. - - max_depth : integer, optional (default=10) - The maximum number of splits to be performed when expanding the tree. - - subsample_ratio : float, optional (default=0.7) - The ratio of the total sample to be used when training a causal tree. - Values greater than 1.0 will be considered equal to 1.0. - Parameter is ignored when bootstrap=True. - - bootstrap : boolean, optional (default=False) - Whether to use bootstrap subsampling. - - lambda_reg : float, optional (default=0.01) - The regularization coefficient in the ell_2 penalty imposed on the - locally linear part of the second stage fit. This is not applied to - the local intercept, only to the coefficient of the linear component. - - propensity_model : estimator, optional (default=sklearn.linear_model.LogisticRegression(penalty='l1',\ - solver='saga',\ - multi_class='auto')) - Model for estimating propensity of treatment at each leaf. - Will be trained on features and controls (concatenated). Must implement `fit` and `predict_proba` methods. - - model_Y : estimator, optional (default=sklearn.linear_model.LassoCV(cv=3)) - Estimator for learning potential outcomes at each leaf. - Will be trained on features, controls and one hot encoded treatments (concatenated). - If different models per treatment arm are desired, see the :class:`.MultiModelWrapper` - helper class. The model(s) must implement `fit` and `predict` methods. - - propensity_model_final : estimator, optional (default=None) - Model for estimating propensity of treatment at at prediction time. - Will be trained on features and controls (concatenated). Must implement `fit` and `predict_proba` methods. - If parameter is set to ``None``, it defaults to the value of `propensity_model` parameter. - - model_Y_final : estimator, optional (default=None) - Estimator for learning potential outcomes at prediction time. - Will be trained on features, controls and one hot encoded treatments (concatenated). - If different models per treatment arm are desired, see the :class:`.MultiModelWrapper` - helper class. The model(s) must implement `fit` and `predict` methods. - If parameter is set to ``None``, it defaults to the value of `model_Y` parameter. - - categories: 'auto' or list - The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). - The first category will be treated as the control treatment. - - n_jobs : int, optional (default=-1) - The number of jobs to run in parallel for both :meth:`fit` and :meth:`effect`. - ``-1`` means using all processors. Since OrthoForest methods are - computationally heavy, it is recommended to set `n_jobs` to -1. - - backend : 'threading' or 'loky', optional (default='loky') - What backend should be used for parallelization with the joblib library. 
- - verbose : int, optional (default=3) - Verbosity level - - batch_size : int or 'auto', optional (default='auto') - Batch_size of jobs for parallelism - - random_state : int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; - If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used - by :mod:`np.random`. - - - """ - - def __init__(self, *, - n_trees=500, - min_leaf_size=10, max_depth=10, - subsample_ratio=0.7, - bootstrap=False, - lambda_reg=0.01, - propensity_model=LogisticRegression(penalty='l1', solver='saga', - multi_class='auto'), # saga solver supports l1 - model_Y=WeightedLassoCVWrapper(cv=3), - propensity_model_final=None, - model_Y_final=None, - categories='auto', - n_jobs=-1, - backend='loky', - verbose=3, - batch_size='auto', - random_state=None): - # Copy and/or define models - self.propensity_model = clone(propensity_model, safe=False) - self.model_Y = clone(model_Y, safe=False) - self.propensity_model_final = clone(propensity_model_final, safe=False) - self.model_Y_final = clone(model_Y_final, safe=False) - if self.propensity_model_final is None: - self.propensity_model_final = clone(self.propensity_model, safe=False) - if self.model_Y_final is None: - self.model_Y_final = clone(self.model_Y, safe=False) - self.random_state = check_random_state(random_state) - - nuisance_estimator = DROrthoForest.nuisance_estimator_generator( - self.propensity_model, self.model_Y, self.random_state, second_stage=False) - second_stage_nuisance_estimator = DROrthoForest.nuisance_estimator_generator( - self.propensity_model_final, self.model_Y_final, self.random_state, second_stage=True) - # Define parameter estimators - parameter_estimator = DROrthoForest.parameter_estimator_func - second_stage_parameter_estimator = DROrthoForest.second_stage_parameter_estimator_gen( - lambda_reg) - # Define moment and mean gradient estimator - moment_and_mean_gradient_estimator = DROrthoForest.moment_and_mean_gradient_estimator_func - if categories != 'auto': - categories = [categories] # OneHotEncoder expects a 2D array with features per column - self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first') - - super().__init__( - nuisance_estimator, - second_stage_nuisance_estimator, - parameter_estimator, - second_stage_parameter_estimator, - moment_and_mean_gradient_estimator, - discrete_treatment=True, - categories=categories, - n_trees=n_trees, - min_leaf_size=min_leaf_size, - max_depth=max_depth, - subsample_ratio=subsample_ratio, - bootstrap=bootstrap, - n_jobs=n_jobs, - backend=backend, - verbose=verbose, - batch_size=batch_size, - random_state=self.random_state) - - @_deprecate_positional("X and W should be passed by keyword only. In a future release " - "we will disallow passing X and W by position.", ['X', 'W']) - def fit(self, Y, T, X, W=None, *, inference='auto'): - """Build an orthogonal random forest from a training set (Y, T, X, W). - - Parameters - ---------- - Y : array-like, shape (n, ) - Outcome for the treatment policy. Must be a vector. - - T : array-like, shape (n, ) - Discrete treatment policy vector. The treatment policy should be a set of consecutive integers - starting with `0`, where `0` denotes the control group. 
Otherwise, the treatment policies - will be ordered lexicographically, with the smallest value being considered the control group. - - X : array-like, shape (n, d_x) - Feature vector that captures heterogeneity. - - W : array-like, shape (n, d_w) or None (default=None) - High-dimensional controls. - - inference: string, :class:`.Inference` instance, or None - Method for performing inference. This estimator supports 'bootstrap' - (or an instance of :class:`.BootstrapInference`) and 'blb' (or an instance of :class:`BLBInference`) - - Returns - ------- - self: an instance of self. - """ - self._set_input_names(Y, T, X, set_flag=True) - Y, T, X, W = check_inputs(Y, T, X, W) - # Check that T is shape (n, ) - # Check T is numeric - T = self._check_treatment(T) - d_t_in = T.shape[1:] - # Train label encoder - T = self._one_hot_encoder.fit_transform(T.reshape(-1, 1)) - self._d_t = T.shape[1:] - self.transformer = FunctionTransformer( - func=_EncoderWrapper(self._one_hot_encoder).encode, - validate=False) - # Call `fit` from parent class - super().fit(Y, T, X=X, W=W, inference=inference) - - # weirdness of wrap_fit. We need to store d_t_in. But because wrap_fit decorates the parent - # fit, we need to set explicitly d_t_in here after super fit is called. - self._d_t_in = d_t_in - return self - - def const_marginal_effect(self, X): - X = check_array(X) - # Override to flatten output if T is flat - effects = super().const_marginal_effect(X=X) - return effects.reshape((-1,) + self._d_y + self._d_t) - const_marginal_effect.__doc__ = BaseOrthoForest.const_marginal_effect.__doc__ - - @staticmethod - def nuisance_estimator_generator(propensity_model, model_Y, random_state=None, second_stage=False): - """Generate nuissance estimator given model inputs from the class.""" - def nuisance_estimator(Y, T, X, W, sample_weight=None, split_indices=None): - # Expand one-hot encoding to include the zero treatment - ohe_T = np.hstack([np.all(1 - T, axis=1, keepdims=True), T]) - # Test that T contains all treatments. 
If not, return None - T = ohe_T @ np.arange(ohe_T.shape[1]) - if len(np.unique(T)) < ohe_T.shape[1]: - return None - # Nuissance estimates evaluated with cross-fitting - this_random_state = check_random_state(random_state) - if (split_indices is None) and second_stage: - # Define 2-fold iterator - kfold_it = StratifiedKFold(n_splits=2, shuffle=True, random_state=this_random_state).split(X, T) - # Check if there is only one example of some class - with warnings.catch_warnings(): - warnings.filterwarnings('error') - try: - split_indices = list(kfold_it)[0] - except Warning as warn: - msg = str(warn) - if "The least populated class in y has only 1 members" in msg: - return None - if W is not None: - X_tilde = np.concatenate((X, W), axis=1) - else: - X_tilde = X - try: - if not second_stage: - # No need to crossfit for internal nodes - propensity_model_clone = clone(propensity_model, safe=False) - propensity_model_clone.fit(X_tilde, T) - propensities = propensity_model_clone.predict_proba(X_tilde) - Y_hat = _group_predict(X_tilde, ohe_T.shape[1], - clone(model_Y, safe=False).fit(np.hstack([X_tilde, ohe_T]), Y).predict) - else: - propensities = _cross_fit(propensity_model, X_tilde, T, split_indices, - sample_weight=sample_weight, predict_func_name='predict_proba') - Y_hat = _group_cross_fit(model_Y, X_tilde, Y, ohe_T, split_indices, sample_weight=sample_weight) - except ValueError as exc: - raise ValueError("The original error: {0}".format(str(exc)) + - " This might be caused by too few sample in the tree leafs." + - " Try increasing the min_leaf_size.") - return Y_hat, propensities - return nuisance_estimator - - @staticmethod - def parameter_estimator_func(Y, T, X, - nuisance_estimates, - sample_weight=None): - """Calculate the parameter of interest for points given by (Y, T) and corresponding nuisance estimates.""" - # Compute partial moments - pointwise_params = DROrthoForest._partial_moments(Y, T, nuisance_estimates) - param_estimate = np.average(pointwise_params, weights=sample_weight, axis=0) - # If any of the values in the parameter estimate is nan, return None - return param_estimate - - @staticmethod - def second_stage_parameter_estimator_gen(lambda_reg): - """ - For the second stage parameter estimation we add a local linear correction. So - we fit a local linear function as opposed to a local constant function. We also penalize - the linear part to reduce variance. - """ - def parameter_estimator_func(Y, T, X, - nuisance_estimates, - sample_weight, - X_single): - """Calculate the parameter of interest for points given by (Y, T) and corresponding nuisance estimates. - - The parameter is calculated around the feature vector given by `X_single`. `X_single` can be used to do - local corrections on a preliminary parameter estimate. 
- """ - # Compute partial moments - pointwise_params = DROrthoForest._partial_moments(Y, T, nuisance_estimates) - X_aug = np.hstack([np.ones((X.shape[0], 1)), X]) - # Compute coefficient by OLS on residuals - if sample_weight is not None: - weighted_X_aug = sample_weight.reshape(-1, 1) * X_aug - else: - weighted_X_aug = X_aug / X_aug.shape[0] - # ell_2 regularization - diagonal = np.ones(X_aug.shape[1]) - diagonal[0] = 0 - reg = lambda_reg * np.diag(diagonal) - # Ridge regression estimate - linear_coef_estimate = np.linalg.lstsq(np.matmul(weighted_X_aug.T, X_aug) + reg, - np.matmul(weighted_X_aug.T, pointwise_params), - rcond=None)[0].flatten() - X_aug = np.append([1], X_single) - linear_coef_estimate = linear_coef_estimate.reshape((X_aug.shape[0], -1)).T - # Parameter returned is of shape (d_T, ) - return np.dot(linear_coef_estimate, X_aug) - - return parameter_estimator_func - - @staticmethod - def moment_and_mean_gradient_estimator_func(Y, T, X, W, - nuisance_estimates, - parameter_estimate): - """Calculate the moments and mean gradient at points given by (Y, T, X, W).""" - # Return moments and gradients - # Compute partial moments - partial_moments = DROrthoForest._partial_moments(Y, T, nuisance_estimates) - # Compute moments - # Moments shape is (n, d_T-1) - moments = partial_moments - parameter_estimate - # Compute moment gradients - n_T = nuisance_estimates[0].shape[1] - 1 - mean_gradient = np.diag(np.ones(n_T) * (-1)) - return moments, mean_gradient - - @staticmethod - def _partial_moments(Y, T, nuisance_estimates): - Y_hat, propensities = nuisance_estimates - partial_moments = np.zeros((len(Y), Y_hat.shape[1] - 1)) - T = T @ np.arange(1, T.shape[1] + 1) - mask_0 = (T == 0) - for i in range(0, Y_hat.shape[1] - 1): - # Need to calculate this in an elegant way for when propensity is 0 - partial_moments[:, i] = Y_hat[:, i + 1] - Y_hat[:, 0] - mask_i = (T == (i + 1)) - partial_moments[:, i][mask_i] += (Y - Y_hat[:, i + 1])[mask_i] / propensities[:, i + 1][mask_i] - partial_moments[:, i][mask_0] -= (Y - Y_hat[:, 0])[mask_0] / propensities[:, 0][mask_0] - return partial_moments - - def _check_treatment(self, T): - try: - # This will flatten T - T = column_or_1d(T) - except Exception as exc: - raise ValueError("Expected array of shape ({n}, ), but got {T_shape}".format(n=len(T), T_shape=T.shape)) - # Check that T is numeric - try: - T.astype(float) - except Exception as exc: - raise ValueError("Expected numeric array but got non-numeric types.") - return T - - -class BLBInference(Inference): - """ - Bootstrap-of-Little-Bags inference implementation for the OrthoForest classes. - - This class can only be used for inference with any estimator derived from :class:`BaseOrthoForest`. - - Parameters - ---------- - estimator : :class:`BaseOrthoForest` - Estimator to perform inference on. Must be a child class of :class:`BaseOrthoForest`. - """ - - def fit(self, estimator, *args, **kwargs): - """ - Fits the inference model. - - This is called after the estimator's fit. - """ - self._estimator = estimator - self._input_names = estimator._input_names - # Test whether the input estimator is supported - if not hasattr(self._estimator, "_predict"): - raise TypeError("Unsupported estimator of type {}.".format(self._estimator.__class__.__name__) + - " Estimators must implement the '_predict' method with the correct signature.") - return self - - def const_marginal_effect_interval(self, X=None, *, alpha=0.1): - """ Confidence intervals for the quantities :math:`\\theta(X)` produced - by the model. 
Available only when ``inference`` is ``blb`` or ``auto``, when - calling the fit method. - - Parameters - ---------- - X: optional (m, d_x) matrix or None (Default=None) - Features for each sample - - alpha: optional float in [0, 1] (Default=0.1) - The overall level of confidence of the reported interval. - The alpha/2, 1-alpha/2 confidence interval is reported. - - Returns - ------- - lower, upper : tuple(type of :meth:`const_marginal_effect(X)` ,\ - type of :meth:`const_marginal_effect(X)` ) - The lower and the upper bounds of the confidence interval for each quantity. - """ - X = check_array(X) - params_and_cov = self._predict_wrapper(X) - # Calculate confidence intervals for the parameter (marginal effect) - lower = alpha / 2 - upper = 1 - alpha / 2 - param_lower = [param + np.apply_along_axis(lambda s: norm.ppf(lower, scale=s), 0, np.sqrt(np.diag(cov_mat))) - for (param, cov_mat) in params_and_cov] - param_upper = [param + np.apply_along_axis(lambda s: norm.ppf(upper, scale=s), 0, np.sqrt(np.diag(cov_mat))) - for (param, cov_mat) in params_and_cov] - param_lower, param_upper = np.asarray(param_lower), np.asarray(param_upper) - return param_lower.reshape((-1,) + self._estimator._d_y + self._estimator._d_t),\ - param_upper.reshape((-1,) + self._estimator._d_y + self._estimator._d_t) - - def const_marginal_effect_inference(self, X=None): - """ Inference results for the quantities :math:`\\theta(X)` produced - by the model. Available only when ``inference`` is ``blb`` or ``auto``, when - calling the fit method. - - Parameters - ---------- - X: optional (m, d_x) matrix or None (Default=None) - Features for each sample - - Returns - ------- - InferenceResults: instance of :class:`~econml.inference.NormalInferenceResults` - The inference results instance contains prediction and prediction standard error and - can on demand calculate confidence interval, z statistic and p value. It can also output - a dataframe summary of these inference results. - """ - X = check_array(X) - params, cov = zip(*(self._predict_wrapper(X))) - params = np.array(params).reshape((-1,) + self._estimator._d_y + self._estimator._d_t) - stderr = np.sqrt(np.diagonal(np.array(cov), axis1=1, axis2=2)) - stderr = stderr.reshape((-1,) + self._estimator._d_y + self._estimator._d_t) - return NormalInferenceResults(d_t=self._estimator._d_t[0] if self._estimator._d_t else 1, - d_y=self._estimator._d_y[0] if self._estimator._d_y else 1, - pred=params, pred_stderr=stderr, inf_type='effect', **self._input_names) - - def _effect_inference_helper(self, X, T0, T1): - X, T0, T1 = self._estimator._expand_treatments(*check_input_arrays(X, T0, T1)) - dT = (T1 - T0) if T0.ndim == 2 else (T1 - T0).reshape(-1, 1) - params_and_cov = self._predict_wrapper(X) - # Calculate confidence intervals for the effect - # Calculate the effects - eff = np.asarray([np.dot(params_and_cov[i][0], dT[i]) for i in range(X.shape[0])]) - # Calculate the standard deviations for the effects - scales = np.asarray([np.sqrt(dT[i] @ params_and_cov[i][1] @ dT[i]) for i in range(X.shape[0])]) - return eff.reshape((-1,) + self._estimator._d_y), scales.reshape((-1,) + self._estimator._d_y) - - def effect_interval(self, X=None, *, T0=0, T1=1, alpha=0.1): - """ Confidence intervals for the quantities :math:`\\tau(X, T0, T1)` produced - by the model. Available only when ``inference`` is ``blb`` or ``auto``, when - calling the fit method. 
- - Parameters - ---------- - X: optional (m, d_x) matrix - Features for each sample - T0: optional (m, d_t) matrix or vector of length m (Default=0) - Base treatments for each sample - T1: optional (m, d_t) matrix or vector of length m (Default=1) - Target treatments for each sample - alpha: optional float in [0, 1] (Default=0.1) - The overall level of confidence of the reported interval. - The alpha/2, 1-alpha/2 confidence interval is reported. - - Returns - ------- - lower, upper : tuple(type of :meth:`effect(X, T0, T1)`, type of :meth:`effect(X, T0, T1))` ) - The lower and the upper bounds of the confidence interval for each quantity. - """ - eff, scales = self._effect_inference_helper(X, T0, T1) - lower = alpha / 2 - upper = 1 - alpha / 2 - effect_lower = eff + np.apply_along_axis(lambda s: norm.ppf(lower, scale=s), 0, scales) - effect_upper = eff + np.apply_along_axis(lambda s: norm.ppf(upper, scale=s), 0, scales) - return effect_lower, effect_upper - - def effect_inference(self, X=None, *, T0=0, T1=1): - """ Inference results for the quantities :math:`\\tau(X, T0, T1)` produced - by the model. Available only when ``inference`` is ``blb`` or ``auto``, when - calling the fit method. - - Parameters - ---------- - X: optional (m, d_x) matrix - Features for each sample - T0: optional (m, d_t) matrix or vector of length m (Default=0) - Base treatments for each sample - T1: optional (m, d_t) matrix or vector of length m (Default=1) - Target treatments for each sample - - Returns - ------- - InferenceResults: instance of :class:`~econml.inference.NormalInferenceResults` - The inference results instance contains prediction and prediction standard error and - can on demand calculate confidence interval, z statistic and p value. It can also output - a dataframe summary of these inference results. - """ - eff, scales = self._effect_inference_helper(X, T0, T1) - return NormalInferenceResults(d_t=1, d_y=self._estimator._d_y[0] if self._estimator._d_y else 1, - pred=eff, pred_stderr=scales, inf_type='effect', **self._input_names) - - def _predict_wrapper(self, X=None): - return self._estimator._predict(X, stderr=True) - - -@deprecated("The ContinuousTreatmentOrthoForest class has been renamed to DMLOrthoForest; " - "an upcoming release will remove support for the old name") -class ContinuousTreatmentOrthoForest(DMLOrthoForest): - pass - - -@deprecated("The DiscreteTreatmentOrthoForest class has been renamed to DROrthoForest; " - "an upcoming release will remove support for the old name") -class DiscreteTreatmentOrthoForest(DROrthoForest): - pass +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +import econml.orf as orf +from .utilities import deprecated + + +@deprecated("The econml.ortho_forest.DMLOrthoForest class has been moved to econml.orf.DMLOrthoForest; " + "an upcoming release will remove support for the old name") +class DMLOrthoForest(orf.DMLOrthoForest): + pass + + +@deprecated("The econml.ortho_forest.DiscreteTreatmentOrthoForest class has been " + "moved to econml.orf.DROrthoForest; " + "an upcoming release will remove support for the old name") +class DROrthoForest(orf.DROrthoForest): + pass + + +@deprecated("The econml.ortho_forest.ContinuousTreatmentOrthoForest class has been " + "renamed to econml.orf.DMLOrthoForest; " + "an upcoming release will remove support for the old name") +class ContinuousTreatmentOrthoForest(orf.DMLOrthoForest): + pass + + +@deprecated("The econml.ortho_forest.DiscreteTreatmentOrthoForest class has been " + "renamed to econml.orf.DROrthoForest; " + "an upcoming release will remove support for the old name") +class DiscreteTreatmentOrthoForest(orf.DROrthoForest): + pass diff --git a/econml/ortho_iv.py b/econml/ortho_iv.py index efe748c3..341946f4 100644 --- a/econml/ortho_iv.py +++ b/econml/ortho_iv.py @@ -1,1769 +1,43 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - -"""Orthogonal IV for Heterogeneous Treatment Effects. - -A Double/Orthogonal machine learning approach to estimation of heterogeneous -treatment effect with an endogenous treatment and an instrument. It -implements the DMLIV and related algorithms from the paper: - -Machine Learning Estimation of Heterogeneous Treatment Effects with Instruments -Vasilis Syrgkanis, Victor Lei, Miruna Oprescu, Maggie Hei, Keith Battocchi, Greg Lewis -https://arxiv.org/abs/1905.10176 - -""" - -import numpy as np -from sklearn.base import clone -from sklearn.linear_model import LinearRegression -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import FunctionTransformer - -from ._ortho_learner import _OrthoLearner -from ._cate_estimator import LinearModelFinalCateEstimatorMixin, StatsModelsCateEstimatorMixin -from .dml.dml import _FinalWrapper -from .inference import StatsModelsInference -from .sklearn_extensions.linear_model import StatsModelsLinearRegression -from .utilities import (_deprecate_positional, add_intercept, fit_with_groups, filter_none_kwargs, - hstack, inverse_onehot) - - -# A cut-down version of the DML first stage wrapper, since we don't need to support linear first stages -class _FirstStageWrapper: - def __init__(self, model, discrete_target): - self._model = clone(model, safe=False) - self._discrete_target = discrete_target - - def _combine(self, X, W, Z, n_samples, fitting=True): - # output is - # * a column of ones if X, W, and Z are all None - # * just X or W or Z if both of the others are None - # * hstack([arrs]) for whatever subset are not None otherwise - - # ensure Z is 2D - if Z is not None: - Z = Z.reshape(n_samples, -1) - - if X is None and W is None and Z is None: - return np.ones((n_samples, 1)) - - arrs = [arr for arr in [X, W, Z] if arr is not None] - - if len(arrs) == 1: - return arrs[0] - else: - return hstack(arrs) - - def fit(self, *, X, W, Target, Z=None, sample_weight=None, groups=None): - if self._discrete_target: - # In this case, the Target is the one-hot-encoding of the treatment variable - # We need to go back to the label representation of the one-hot so as to call - # the classifier. 
- if np.any(np.all(Target == 0, axis=0)) or (not np.any(np.all(Target == 0, axis=1))): - raise AttributeError("Provided crossfit folds contain training splits that " + - "don't contain all treatments") - Target = inverse_onehot(Target) - - if sample_weight is not None: - fit_with_groups(self._model, self._combine(X, W, Z, Target.shape[0]), Target, - groups=groups, sample_weight=sample_weight) - else: - fit_with_groups(self._model, self._combine(X, W, Z, Target.shape[0]), Target, - groups=groups) - - def score(self, *, X, W, Target, Z=None, sample_weight=None): - if hasattr(self._model, 'score'): - if self._discrete_target: - # In this case, the Target is the one-hot-encoding of the treatment variable - # We need to go back to the label representation of the one-hot so as to call - # the classifier. - if np.any(np.all(Target == 0, axis=0)) or (not np.any(np.all(Target == 0, axis=1))): - raise AttributeError("Provided crossfit folds contain training splits that " + - "don't contain all treatments") - Target = inverse_onehot(Target) - - if sample_weight is not None: - return self._model.score(self._combine(X, W, Z, Target.shape[0]), Target, sample_weight=sample_weight) - else: - return self._model.score(self._combine(X, W, Z, Target.shape[0]), Target) - else: - return None - - def predict(self, X, W, Z=None): - arrs = [arr for arr in [X, W, Z] if arr is not None] - n_samples = arrs[0].shape[0] if arrs else 1 - if self._discrete_target: - return self._model.predict_proba(self._combine(X, W, Z, n_samples, fitting=False))[:, 1:] - else: - return self._model.predict(self._combine(X, W, Z, n_samples, fitting=False)) - - -class _BaseDMLATEIVModelFinal: - def __init__(self): - self._first_stage = LinearRegression(fit_intercept=False) - self._model_final = _FinalWrapper(LinearRegression(fit_intercept=False), - fit_cate_intercept=True, featurizer=None, use_weight_trick=False) - - def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - Y_res, T_res, Z_res = nuisances - if Z_res.ndim == 1: - Z_res = Z_res.reshape(-1, 1) - # DMLATEIV is just like 2SLS; first regress T_res on Z_res, then regress Y_res on predicted T_res - T_res_pred = self._first_stage.fit(Z_res, T_res, - sample_weight=sample_weight).predict(Z_res) - # TODO: allow the final model to actually use X? Then we'd need to rename the class - # since we would actually be calculating a CATE rather than ATE. - self._model_final.fit(X=None, T_res=T_res_pred, Y_res=Y_res, sample_weight=sample_weight) - return self - - def predict(self, X=None): - # TODO: allow the final model to actually use X? - return self._model_final.predict(X=None) - - def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - Y_res, T_res, Z_res = nuisances - if Y_res.ndim == 1: - Y_res = Y_res.reshape((-1, 1)) - if T_res.ndim == 1: - T_res = T_res.reshape((-1, 1)) - # TODO: allow the final model to actually use X? 
- effects = self._model_final.predict(X=None).reshape((-1, Y_res.shape[1], T_res.shape[1])) - Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) - if sample_weight is not None: - return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) - else: - return np.mean((Y_res - Y_res_pred) ** 2) - - -class _BaseDMLATEIV(_OrthoLearner): - def __init__(self, discrete_instrument=False, - discrete_treatment=False, - categories='auto', - cv=2, - n_splits='raise', - mc_iters=None, - mc_agg='mean', - random_state=None): - super().__init__(discrete_treatment=discrete_treatment, - discrete_instrument=discrete_instrument, - categories=categories, - cv=cv, - n_splits=n_splits, - mc_iters=mc_iters, - mc_agg=mc_agg, - random_state=random_state) - - def _gen_ortho_learner_model_final(self): - return _BaseDMLATEIVModelFinal() - - @_deprecate_positional("W and Z should be passed by keyword only. In a future release " - "we will disallow passing W and Z by position.", ['W', 'Z']) - def fit(self, Y, T, Z, W=None, *, sample_weight=None, sample_var=None, groups=None, - cache_values=False, inference=None): - """ - Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. - - Parameters - ---------- - Y: (n, d_y) matrix or vector of length n - Outcomes for each sample - T: (n, d_t) matrix or vector of length n - Treatments for each sample - Z: (n, d_z) matrix - Instruments for each sample - X: optional(n, d_x) matrix or None (Default=None) - Features for each sample - sample_weight: optional(n,) vector or None (Default=None) - Weights for each samples - sample_var: optional(n,) vector or None (Default=None) - Sample variance for each sample - groups: (n,) vector, optional - All rows corresponding to the same group will be kept together during splitting. - If groups is not None, the `cv` argument passed to this class's initializer - must support a 'groups' argument to its split method. - cache_values: bool, default False - Whether to cache inputs and first stage results, which will allow refitting a different final model - inference: string,:class:`.Inference` instance, or None - Method for performing inference. This estimator supports 'bootstrap' - (or an instance of:class:`.BootstrapInference`). - - Returns - ------- - self: _BaseDMLATEIV instance - """ - # Replacing fit from _OrthoLearner, to enforce W=None and improve the docstring - return super().fit(Y, T, W=W, Z=Z, - sample_weight=sample_weight, sample_var=sample_var, groups=groups, - cache_values=cache_values, inference=inference) - - def score(self, Y, T, Z, W=None): - """ - Score the fitted CATE model on a new data set. Generates nuisance parameters - for the new data set based on the fitted residual nuisance models created at fit time. - It uses the mean prediction of the models fitted by the different crossfit folds. - Then calculates the MSE of the final residual Y on residual T regression. - - If model_final does not have a score method, then it raises an :exc:`.AttributeError` - - Parameters - ---------- - Y: (n, d_y) matrix or vector of length n - Outcomes for each sample - T: (n, d_t) matrix or vector of length n - Treatments for each sample - Z: optional(n, d_z) matrix - Instruments for each sample - X: optional(n, d_x) matrix or None (Default=None) - Features for each sample - - - Returns - ------- - score: float - The MSE of the final CATE model on the new data. 
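The final stage implemented by `_BaseDMLATEIVModelFinal` above is a 2SLS-style regression on cross-fitted residuals: project `T_res` onto `Z_res`, then regress `Y_res` on the projection. A minimal sketch, assuming `Y_res`, `T_res`, `Z_res` are the residuals of the outcome, treatment and instrument after partialling out `W` (the variable names are illustrative):

```Python
import numpy as np
from sklearn.linear_model import LinearRegression

Z_res = np.asarray(Z_res).reshape(len(Z_res), -1)
T_res = np.asarray(T_res).reshape(len(T_res), -1)

# First stage: project the treatment residual onto the instrument residual
T_res_pred = LinearRegression(fit_intercept=False).fit(Z_res, T_res).predict(Z_res)

# Second stage: regress the outcome residual on the projected treatment residual
ate = LinearRegression(fit_intercept=False).fit(T_res_pred, Y_res).coef_
```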
- """ - # Replacing score from _OrthoLearner, to enforce X=None and improve the docstring - return super().score(Y, T, W=W, Z=Z) - - -class _DMLATEIVModelNuisance: - def __init__(self, model_Y_W, model_T_W, model_Z_W): - self._model_Y_W = clone(model_Y_W, safe=False) - self._model_T_W = clone(model_T_W, safe=False) - self._model_Z_W = clone(model_Z_W, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - assert X is None, "DML ATE IV does not accept features" - self._model_Y_W.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) - self._model_T_W.fit(X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) - self._model_Z_W.fit(X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) - return self - - def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - assert X is None, "DML ATE IV does not accept features" - if hasattr(self._model_Y_W, 'score'): - Y_X_score = self._model_Y_W.score(X=X, W=W, Target=Y, sample_weight=sample_weight) - else: - Y_X_score = None - if hasattr(self._model_T_W, 'score'): - T_X_score = self._model_T_W.score(X=X, W=W, Target=T, sample_weight=sample_weight) - else: - T_X_score = None - if hasattr(self._model_Z_W, 'score'): - Z_X_score = self._model_Z_W.score(X=X, W=W, Target=Z, sample_weight=sample_weight) - else: - Z_X_score = None - return Y_X_score, T_X_score, Z_X_score - - def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - assert X is None, "DML ATE IV does not accept features" - Y_pred = self._model_Y_W.predict(X=X, W=W) - T_pred = self._model_T_W.predict(X=X, W=W) - Z_pred = self._model_Z_W.predict(X=X, W=W) - if W is None: # In this case predict above returns a single row - Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) - T_pred = np.tile(T_pred.reshape(1, -1), (T.shape[0], 1)) - Z_pred = np.tile(Z_pred.reshape(1, -1), (Z.shape[0], 1)) - Y_res = Y - Y_pred.reshape(Y.shape) - T_res = T - T_pred.reshape(T.shape) - Z_res = Z - Z_pred.reshape(Z.shape) - return Y_res, T_res, Z_res - - -class DMLATEIV(_BaseDMLATEIV): - """ - Implementation of the orthogonal/double ml method for ATE estimation with - IV as described in - - Double/Debiased Machine Learning for Treatment and Causal Parameters - Victor Chernozhukov, Denis Chetverikov, Mert Demirer, Esther Duflo, Christian Hansen, Whitney Newey, James Robins - https://arxiv.org/abs/1608.00060 - - Requires that either co-variance of T, Z is independent of X or that effect - is not heterogeneous in X for correct recovery. Otherwise it estimates - a biased ATE. 
- """ - - def __init__(self, *, - model_Y_W, - model_T_W, - model_Z_W, - discrete_treatment=False, - discrete_instrument=False, - categories='auto', - cv=2, - n_splits='raise', - mc_iters=None, - mc_agg='mean', - random_state=None): - self.model_Y_W = clone(model_Y_W, safe=False) - self.model_T_W = clone(model_T_W, safe=False) - self.model_Z_W = clone(model_Z_W, safe=False) - super().__init__(discrete_instrument=discrete_instrument, - discrete_treatment=discrete_treatment, - categories=categories, - cv=cv, - n_splits=n_splits, - mc_iters=mc_iters, - mc_agg=mc_agg, - random_state=random_state) - - def _gen_ortho_learner_model_nuisance(self): - return _DMLATEIVModelNuisance( - model_Y_W=_FirstStageWrapper(clone(self.model_Y_W, safe=False), discrete_target=False), - model_T_W=_FirstStageWrapper(clone(self.model_T_W, safe=False), discrete_target=self.discrete_treatment), - model_Z_W=_FirstStageWrapper(clone(self.model_Z_W, safe=False), discrete_target=self.discrete_instrument)) - - -class _ProjectedDMLATEIVModelNuisance: - - def __init__(self, model_Y_W, model_T_W, model_T_WZ): - self._model_Y_W = clone(model_Y_W, safe=False) - self._model_T_W = clone(model_T_W, safe=False) - self._model_T_WZ = clone(model_T_WZ, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - assert X is None, "DML ATE IV does not accept features" - self._model_Y_W.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) - self._model_T_W.fit(X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) - self._model_T_WZ.fit(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight, groups=groups) - return self - - def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - assert X is None, "DML ATE IV does not accept features" - if hasattr(self._model_Y_W, 'score'): - Y_X_score = self._model_Y_W.score(X=X, W=W, Target=Y, sample_weight=sample_weight) - else: - Y_X_score = None - if hasattr(self._model_T_W, 'score'): - T_X_score = self._model_T_W.score(X=X, W=W, Target=T, sample_weight=sample_weight) - else: - T_X_score = None - if hasattr(self._model_T_WZ, 'score'): - T_XZ_score = self._model_T_WZ.score(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight) - else: - T_XZ_score = None - return Y_X_score, T_X_score, T_XZ_score - - def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - assert X is None, "DML ATE IV does not accept features" - Y_pred = self._model_Y_W.predict(X, W) - TX_pred = self._model_T_W.predict(X, W) - TXZ_pred = self._model_T_WZ.predict(X, W, Z) - if W is None: # In this case predict above returns a single row - Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) - TX_pred = np.tile(TX_pred.reshape(1, -1), (T.shape[0], 1)) - Y_res = Y - Y_pred.reshape(Y.shape) - T_res = T - TX_pred.reshape(T.shape) - Z_res = TXZ_pred.reshape(T.shape) - TX_pred.reshape(T.shape) - return Y_res, T_res, Z_res - - -class ProjectedDMLATEIV(_BaseDMLATEIV): - - def __init__(self, *, - model_Y_W, - model_T_W, - model_T_WZ, - discrete_treatment=False, - discrete_instrument=False, - categories='auto', - cv=2, - n_splits='raise', - mc_iters=None, - mc_agg='mean', - random_state=None): - self.model_Y_W = clone(model_Y_W, safe=False) - self.model_T_W = clone(model_T_W, safe=False) - self.model_T_WZ = clone(model_T_WZ, safe=False) - super().__init__(discrete_instrument=discrete_instrument, - discrete_treatment=discrete_treatment, - categories=categories, - cv=cv, - n_splits=n_splits, - mc_iters=mc_iters, - mc_agg=mc_agg, - random_state=random_state) - 
- def _gen_ortho_learner_model_nuisance(self): - return _ProjectedDMLATEIVModelNuisance( - model_Y_W=_FirstStageWrapper(clone(self.model_Y_W, safe=False), discrete_target=False), - model_T_W=_FirstStageWrapper(clone(self.model_T_W, safe=False), discrete_target=self.discrete_treatment), - model_T_WZ=_FirstStageWrapper(clone(self.model_T_WZ, safe=False), - discrete_target=self.discrete_treatment)) - - -class _BaseDMLIVModelNuisance: - """ - Nuisance model fits the three models at fit time and at predict time - returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals. - """ - - def __init__(self, model_Y_X, model_T_X, model_T_XZ): - self._model_Y_X = clone(model_Y_X, safe=False) - self._model_T_X = clone(model_T_X, safe=False) - self._model_T_XZ = clone(model_T_XZ, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - # TODO: would it be useful to extend to handle controls ala vanilla DML? - assert W is None, "DML IV does not accept controls" - self._model_Y_X.fit(X=X, W=None, Target=Y, sample_weight=sample_weight, groups=groups) - self._model_T_X.fit(X=X, W=None, Target=T, sample_weight=sample_weight, groups=groups) - self._model_T_XZ.fit(X=X, W=None, Z=Z, Target=T, sample_weight=sample_weight, groups=groups) - return self - - def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - assert W is None, "DML IV does not accept controls" - if hasattr(self._model_Y_X, 'score'): - Y_X_score = self._model_Y_X.score(X=X, W=W, Target=Y, sample_weight=sample_weight) - else: - Y_X_score = None - if hasattr(self._model_T_X, 'score'): - T_X_score = self._model_T_X.score(X=X, W=W, Target=T, sample_weight=sample_weight) - else: - T_X_score = None - if hasattr(self._model_T_XZ, 'score'): - T_XZ_score = self._model_T_XZ.score(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight) - else: - T_XZ_score = None - return Y_X_score, T_X_score, T_XZ_score - - def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - assert W is None, "DML IV does not accept controls" - Y_pred = self._model_Y_X.predict(X, W) - TXZ_pred = self._model_T_XZ.predict(X, W, Z) - TX_pred = self._model_T_X.predict(X, W) - if X is None: # In this case predict above returns a single row - Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) - TX_pred = np.tile(TX_pred.reshape(1, -1), (T.shape[0], 1)) - Y_res = Y - Y_pred.reshape(Y.shape) - T_res = TXZ_pred.reshape(T.shape) - TX_pred.reshape(T.shape) - return Y_res, T_res - - -class _BaseDMLIVModelFinal: - """ - Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient - that depends on X, i.e. - - .. math :: - Y - \\E[Y | X] = \\theta(X) \\cdot (\\E[T | X, Z] - \\E[T | X]) + \\epsilon - - and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final - residual on residual regression. 
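The final stage described above regresses the outcome residual on the (featurized) instrument-strength residual. For the linear specialization implemented by `DMLIV` further below, a minimal sketch, assuming `X`, `Y_res`, `T_res` hold the features and the cross-fitted residuals `Y - E[Y|X]` and `E[T|X,Z] - E[T|X]` for a single treatment:

```Python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

phi = PolynomialFeatures(degree=2, include_bias=True)      # stands in for the user-supplied featurizer
XT = phi.fit_transform(X) * np.reshape(T_res, (-1, 1))     # phi(X) scaled by the treatment residual
final = LinearRegression(fit_intercept=False).fit(XT, Y_res)
theta = phi.transform(X) @ final.coef_                      # CATE estimate theta(X) at the training points
```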
- """ - - def __init__(self, model_final): - self._model_final = clone(model_final, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - Y_res, T_res = nuisances - self._model_final.fit(X, T_res, Y_res, sample_weight=sample_weight, sample_var=sample_var) - return self - - def predict(self, X=None): - return self._model_final.predict(X) - - def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - Y_res, T_res = nuisances - if Y_res.ndim == 1: - Y_res = Y_res.reshape((-1, 1)) - if T_res.ndim == 1: - T_res = T_res.reshape((-1, 1)) - effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) - Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) - if sample_weight is not None: - return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) - else: - return np.mean((Y_res - Y_res_pred)**2) - - -class _BaseDMLIV(_OrthoLearner): - """ - The class _BaseDMLIV implements the base class of the DMLIV - algorithm for estimating a CATE. It accepts three generic machine - learning models: - 1) model_Y_X that estimates :math:`\\E[Y | X]` - 2) model_T_X that estimates :math:`\\E[T | X]` - 3) model_T_XZ that estimates :math:`\\E[T | X, Z]` - These are estimated in a cross-fitting manner for each sample in the training set. - Then it minimizes the square loss: - - .. math:: - \\sum_i (Y_i - \\E[Y|X_i] - \theta(X) * (\\E[T|X_i, Z_i] - \\E[T|X_i]))^2 - - This loss is minimized by the model_final class, which is passed as an input. - In the two children classes {DMLIV, GenericDMLIV}, we implement different strategies of how to invoke - machine learning algorithms to minimize this final square loss. - - - Parameters - ---------- - model_Y_X : estimator - model to estimate :math:`\\E[Y | X]`. Must support `fit` and `predict` methods. - - model_T_X : estimator - model to estimate :math:`\\E[T | X]`. Must support `fit` and `predict` methods - - model_T_XZ : estimator - model to estimate :math:`\\E[T | X, Z]`. Must support `fit(X, Z, T, *, sample_weights)` - and `predict(X, Z)` methods. - - model_final : estimator - final model that at fit time takes as input :math:`(Y-\\E[Y|X])`, :math:`(\\E[T|X,Z]-\\E[T|X])` and X - and supports method predict(X) that produces the CATE at X - - discrete_instrument: bool, optional, default False - Whether the instrument values should be treated as categorical, rather than continuous, quantities - - discrete_treatment: bool, optional, default False - Whether the treatment values should be treated as categorical, rather than continuous, quantities - - categories: 'auto' or list, default 'auto' - The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). - The first category will be treated as the control treatment. - - cv: int, cross-validation generator or an iterable, optional, default 2 - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - :term:`CV splitter` - - An iterable yielding (train, test) splits as arrays of indices. - - For integer/None inputs, if the treatment is discrete - :class:`~sklearn.model_selection.StratifiedKFold` is used, else, - :class:`~sklearn.model_selection.KFold` is used - (with a random shuffle in either case). - - Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. 
If all - W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. - - mc_iters: int, optional (default=None) - The number of times to rerun the first stage models to reduce the variance of the nuisances. - - mc_agg: {'mean', 'median'}, optional (default='mean') - How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of - cross-fitting. - - random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; - If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used - by :mod:`np.random`. - """ - - def __init__(self, discrete_instrument=False, discrete_treatment=False, categories='auto', - cv=2, - n_splits='raise', - mc_iters=None, mc_agg='mean', - random_state=None): - super().__init__(discrete_treatment=discrete_treatment, - discrete_instrument=discrete_instrument, - categories=categories, - cv=cv, - n_splits=n_splits, - mc_iters=mc_iters, - mc_agg=mc_agg, - random_state=random_state) - - @_deprecate_positional("Z and X should be passed by keyword only. In a future release " - "we will disallow passing Z and X by position.", ['X', 'Z']) - def fit(self, Y, T, Z, X=None, *, sample_weight=None, sample_var=None, groups=None, - cache_values=False, inference=None): - """ - Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. - - Parameters - ---------- - Y: (n, d_y) matrix or vector of length n - Outcomes for each sample - T: (n, d_t) matrix or vector of length n - Treatments for each sample - Z: (n, d_z) matrix - Instruments for each sample - X: optional(n, d_x) matrix or None (Default=None) - Features for each sample - sample_weight: optional(n,) vector or None (Default=None) - Weights for each samples - sample_var: optional(n,) vector or None (Default=None) - Sample variance for each sample - groups: (n,) vector, optional - All rows corresponding to the same group will be kept together during splitting. - If groups is not None, the `cv` argument passed to this class's initializer - must support a 'groups' argument to its split method. - cache_values: bool, default False - Whether to cache inputs and first stage results, which will allow refitting a different final model - inference: string,:class:`.Inference` instance, or None - Method for performing inference. This estimator supports 'bootstrap' - (or an instance of:class:`.BootstrapInference`). - - Returns - ------- - self: _BaseDMLIV - """ - # Replacing fit from _OrthoLearner, to enforce W=None and improve the docstring - return super().fit(Y, T, X=X, Z=Z, - sample_weight=sample_weight, sample_var=sample_var, groups=groups, - cache_values=cache_values, inference=inference) - - def score(self, Y, T, Z, X=None): - """ - Score the fitted CATE model on a new data set. Generates nuisance parameters - for the new data set based on the fitted residual nuisance models created at fit time. - It uses the mean prediction of the models fitted by the different crossfit folds. - Then calculates the MSE of the final residual Y on residual T regression. 
- - If model_final does not have a score method, then it raises an :exc:`.AttributeError` - - Parameters - ---------- - Y: (n, d_y) matrix or vector of length n - Outcomes for each sample - T: (n, d_t) matrix or vector of length n - Treatments for each sample - Z: optional(n, d_z) matrix - Instruments for each sample - X: optional(n, d_x) matrix or None (Default=None) - Features for each sample - - - Returns - ------- - score: float - The MSE of the final CATE model on the new data. - """ - # Replacing score from _OrthoLearner, to enforce W=None and improve the docstring - return super().score(Y, T, X=X, Z=Z) - - @property - def original_featurizer(self): - return self.ortho_learner_model_final_._model_final._original_featurizer - - @property - def featurizer_(self): - # NOTE This is used by the inference methods and has to be the overall featurizer. intended - # for internal use by the library - return self.ortho_learner_model_final_._model_final._featurizer - - @property - def model_final_(self): - # NOTE This is used by the inference methods and is more for internal use to the library - return self.ortho_learner_model_final_._model_final._model - - @property - def model_cate(self): - """ - Get the fitted final CATE model. - - Returns - ------- - model_cate: object of type(model_final) - An instance of the model_final object that was fitted after calling fit which corresponds - to the constant marginal CATE model. - """ - return self.ortho_learner_model_final_._model_final._model - - @property - def models_Y_X(self): - """ - Get the fitted models for :math:`\\E[Y | X]`. - - Returns - ------- - models_Y_X: list of objects of type(`model_Y_X`) - A list of instances of the `model_Y_X` object. Each element corresponds to a crossfitting - fold and is the model instance that was fitted for that training fold. - """ - return [mdl._model_Y_X._model for mdl in super().models_nuisance_] - - @property - def models_T_X(self): - """ - Get the fitted models for :math:`\\E[T | X]`. - - Returns - ------- - models_T_X: list of objects of type(`model_T_X`) - A list of instances of the `model_T_X` object. Each element corresponds to a crossfitting - fold and is the model instance that was fitted for that training fold. - """ - return [mdl._model_T_X._model for mdl in super().models_nuisance_] - - @property - def models_T_XZ(self): - """ - Get the fitted models for :math:`\\E[T | X, Z]`. - - Returns - ------- - models_T_XZ: list of objects of type(`model_T_XZ`) - A list of instances of the `model_T_XZ` object. Each element corresponds to a crossfitting - fold and is the model instance that was fitted for that training fold. - """ - return [mdl._model_T_XZ._model for mdl in super().models_nuisance_] - - @property - def nuisance_scores_Y_X(self): - """ - Get the scores for Y_X model on the out-of-sample training data - """ - return self.nuisance_scores_[0] - - @property - def nuisance_scores_T_X(self): - """ - Get the scores for T_X model on the out-of-sample training data - """ - return self.nuisance_scores_[1] - - @property - def nuisance_scores_T_XZ(self): - """ - Get the scores for T_XZ model on the out-of-sample training data - """ - return self.nuisance_scores_[2] - - def cate_feature_names(self, feature_names=None): - """ - Get the output feature names. - - Parameters - ---------- - feature_names: list of strings of length X.shape[1] or None - The names of the input features. If None and X is a dataframe, it defaults to the column names - from the dataframe. 
- - Returns - ------- - out_feature_names: list of strings or None - The names of the output features :math:`\\phi(X)`, i.e. the features with respect to which the - final constant marginal CATE model is linear. It is the names of the features that are associated - with each entry of the :meth:`coef_` parameter. Not available when the featurizer is not None and - does not have a method: `get_feature_names(feature_names)`. Otherwise None is returned. - """ - if feature_names is None: - feature_names = self._input_names["feature_names"] - if self.original_featurizer is None: - return feature_names - elif hasattr(self.original_featurizer, 'get_feature_names'): - return self.original_featurizer.get_feature_names(feature_names) - else: - raise AttributeError("Featurizer does not have a method: get_feature_names!") - - -class DMLIV(LinearModelFinalCateEstimatorMixin, _BaseDMLIV): - """ - A child of the _BaseDMLIV class that specifies a particular effect model - where the treatment effect is linear in some featurization of the variable X - The features are created by a provided featurizer that supports fit_transform. - Then an arbitrary model fits on the composite set of features. - - Concretely, it assumes that :math:`\\theta(X)=<\\theta, \\phi(X)>` for some features :math:`\\phi(X)` - and runs a linear model regression of :math:`Y-\\E[Y|X]` on :math:`phi(X)*(\\E[T|X,Z]-\\E[T|X])`. - The features are created by the featurizer provided by the user. The particular - linear model regression is also specified by the user (e.g. Lasso, ElasticNet) - - Parameters - ---------- - model_Y_X : estimator - model to estimate :math:`\\E[Y | X]`. Must support `fit` and `predict` methods. - - model_T_X : estimator - model to estimate :math:`\\E[T | X]`. Must support `fit` and either `predict` or `predict_proba` methods, - depending on whether the treatment is discrete. - - model_T_XZ : estimator - model to estimate :math:`\\E[T | X, Z]`. Must support `fit` and either `predict` or `predict_proba` methods, - depending on whether the treatment is discrete. - - model_final : estimator - final linear model for predicting :math:`(Y-\\E[Y|X])` from :math:`\\phi(X) \\cdot (\\E[T|X,Z]-\\E[T|X])` - Method is incorrect if this model is not linear (e.g. Lasso, ElasticNet, LinearRegression). - - featurizer: :term:`transformer`, optional, default None - Must support fit_transform and transform. Used to create composite features in the final CATE regression. - It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). - If featurizer=None, then CATE is trained on X. - - fit_cate_intercept : bool, optional, default True - Whether the linear CATE model should have a constant term. - - cv: int, cross-validation generator or an iterable, optional, default 2 - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - :term:`CV splitter` - - An iterable yielding (train, test) splits as arrays of indices. - - For integer/None inputs, if the treatment is discrete - :class:`~sklearn.model_selection.StratifiedKFold` is used, else, - :class:`~sklearn.model_selection.KFold` is used - (with a random shuffle in either case). - - Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all - W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. 
- - mc_iters: int, optional (default=None) - The number of times to rerun the first stage models to reduce the variance of the nuisances. - - mc_agg: {'mean', 'median'}, optional (default='mean') - How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of - cross-fitting. - - discrete_instrument: bool, optional, default False - Whether the instrument values should be treated as categorical, rather than continuous, quantities - - discrete_treatment: bool, optional, default False - Whether the treatment values should be treated as categorical, rather than continuous, quantities - - categories: 'auto' or list, default 'auto' - The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). - The first category will be treated as the control treatment. - - random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; - If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used - by :mod:`np.random`. - """ - - def __init__(self, *, - model_Y_X, - model_T_X, - model_T_XZ, - model_final, - featurizer=None, - fit_cate_intercept=True, - cv=2, - n_splits='raise', - mc_iters=None, - mc_agg='mean', - discrete_instrument=False, discrete_treatment=False, - categories='auto', random_state=None): - self.model_Y_X = clone(model_Y_X, safe=False) - self.model_T_X = clone(model_T_X, safe=False) - self.model_T_XZ = clone(model_T_XZ, safe=False) - self.model_final = clone(model_final, safe=False) - self.featurizer = clone(featurizer, safe=False) - self.fit_cate_intercept = fit_cate_intercept - super().__init__(cv=cv, - n_splits=n_splits, - mc_iters=mc_iters, - mc_agg=mc_agg, - discrete_instrument=discrete_instrument, - discrete_treatment=discrete_treatment, - categories=categories, - random_state=random_state) - - def _gen_ortho_learner_model_nuisance(self): - return _BaseDMLIVModelNuisance(_FirstStageWrapper(clone(self.model_Y_X, safe=False), False), - _FirstStageWrapper(clone(self.model_T_X, safe=False), self.discrete_treatment), - _FirstStageWrapper(clone(self.model_T_XZ, safe=False), self.discrete_treatment)) - - def _gen_ortho_learner_model_final(self): - return _BaseDMLIVModelFinal(_FinalWrapper(clone(self.model_final, safe=False), - fit_cate_intercept=self.fit_cate_intercept, - featurizer=clone(self.featurizer, safe=False), - use_weight_trick=False)) - - @property - def bias_part_of_coef(self): - return self.ortho_learner_model_final_._model_final._fit_cate_intercept - - @property - def fit_cate_intercept_(self): - return self.ortho_learner_model_final_._model_final._fit_cate_intercept - - -class NonParamDMLIV(_BaseDMLIV): - """ - A child of the _BaseDMLIV class that allows for an arbitrary square loss based ML - method in the final stage of the DMLIV algorithm. The method has to support - sample weights and the fit method has to take as input sample_weights (e.g. random forests), i.e. - fit(X, y, sample_weight=None) - It achieves this by re-writing the final stage square loss of the DMLIV algorithm as: - - .. math :: - \\sum_i (\\E[T|X_i, Z_i] - \\E[T|X_i])^2 * ((Y_i - \\E[Y|X_i])/(\\E[T|X_i, Z_i] - \\E[T|X_i]) - \\theta(X))^2 - - Then this can be viewed as a weighted square loss regression, where the target label is - - .. 
math :: - \\tilde{Y}_i = (Y_i - \\E[Y|X_i])/(\\E[T|X_i, Z_i] - \\E[T|X_i]) - - and each sample has a weight of - - .. math :: - V(X_i) = (\\E[T|X_i, Z_i] - \\E[T|X_i])^2 - - Thus we can call any regression model with inputs: - - fit(X, :math:`\\tilde{Y}_i`, sample_weight= :math:`V(X_i)`) - - Parameters - ---------- - model_Y_X : estimator - model to estimate :math:`\\E[Y | X]`. Must support `fit` and `predict` methods. - - model_T_X : estimator - model to estimate :math:`\\E[T | X]`. Must support `fit` and either `predict` or `predict_proba` methods, - depending on whether the treatment is discrete. - - model_T_XZ : estimator - model to estimate :math:`\\E[T | X, Z]`. Must support `fit` and either `predict` or `predict_proba` methods, - depending on whether the treatment is discrete. - - model_final : estimator - final model for predicting :math:`\\tilde{Y}` from X with sample weights V(X) - - cv: int, cross-validation generator or an iterable, optional, default 2 - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - :term:`CV splitter` - - An iterable yielding (train, test) splits as arrays of indices. - - For integer/None inputs, if the treatment is discrete - :class:`~sklearn.model_selection.StratifiedKFold` is used, else, - :class:`~sklearn.model_selection.KFold` is used - (with a random shuffle in either case). - - Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all - W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. - - mc_iters: int, optional (default=None) - The number of times to rerun the first stage models to reduce the variance of the nuisances. - - mc_agg: {'mean', 'median'}, optional (default='mean') - How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of - cross-fitting. - - discrete_instrument: bool, optional, default False - Whether the instrument values should be treated as categorical, rather than continuous, quantities - - discrete_treatment: bool, optional, default False - Whether the treatment values should be treated as categorical, rather than continuous, quantities - - categories: 'auto' or list, default 'auto' - The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). - The first category will be treated as the control treatment. - - random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; - If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used - by :mod:`np.random`. 
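The re-weighting described above can be reproduced with any square-loss learner that accepts `sample_weight`. A minimal sketch, assuming `X`, `Y_res`, `T_res` are the features and the cross-fitted residuals for a single treatment and instrument (in practice very small residuals would need clipping to keep the pseudo-outcome stable):

```Python
import numpy as np
from sklearn.ensemble import RandomForestRegressor

T_res = np.ravel(T_res)
Y_tilde = np.ravel(Y_res) / T_res          # pseudo-outcome (Y - E[Y|X]) / (E[T|X,Z] - E[T|X])
V = T_res ** 2                             # weights (E[T|X,Z] - E[T|X])^2

model_final = RandomForestRegressor(n_estimators=200).fit(X, Y_tilde, sample_weight=V)
theta = model_final.predict(X)             # CATE estimate theta(X)
```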
- - """ - - def __init__(self, *, - model_Y_X, - model_T_X, - model_T_XZ, - model_final, - featurizer=None, - cv=2, - n_splits='raise', - mc_iters=None, - mc_agg='mean', - discrete_instrument=False, - discrete_treatment=False, - categories='auto', - random_state=None): - self.model_Y_X = clone(model_Y_X, safe=False) - self.model_T_X = clone(model_T_X, safe=False) - self.model_T_XZ = clone(model_T_XZ, safe=False) - self.model_final = clone(model_final, safe=False) - self.featurizer = clone(featurizer, safe=False) - super().__init__(cv=cv, - n_splits=n_splits, - mc_iters=mc_iters, - mc_agg=mc_agg, - discrete_instrument=discrete_instrument, - discrete_treatment=discrete_treatment, - categories=categories, - random_state=random_state) - - def _gen_ortho_learner_model_nuisance(self): - return _BaseDMLIVModelNuisance(_FirstStageWrapper(clone(self.model_Y_X, safe=False), False), - _FirstStageWrapper(clone(self.model_T_X, safe=False), self.discrete_treatment), - _FirstStageWrapper(clone(self.model_T_XZ, safe=False), self.discrete_treatment)) - - def _gen_ortho_learner_model_final(self): - return _BaseDMLIVModelFinal(_FinalWrapper(clone(self.model_final, safe=False), - fit_cate_intercept=False, - featurizer=clone(self.featurizer, safe=False), - use_weight_trick=True)) - - -class _BaseDRIVModelFinal: - """ - Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient - that depends on X, i.e. - - .. math :: - Y - \\E[Y | X] = \\theta(X) \\cdot (\\E[T | X, Z] - \\E[T | X]) + \\epsilon - - and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final - residual on residual regression. - """ - - def __init__(self, model_final, featurizer, - discrete_treatment, discrete_instrument, - fit_cate_intercept, cov_clip, opt_reweighted): - self._model_final = clone(model_final, safe=False) - self._fit_cate_intercept = fit_cate_intercept - self._original_featurizer = clone(featurizer, safe=False) - self._discrete_treatment = discrete_treatment - self._discrete_instrument = discrete_instrument - if self._fit_cate_intercept: - add_intercept_trans = FunctionTransformer(add_intercept, - validate=True) - if featurizer: - self._featurizer = Pipeline([('featurize', self._original_featurizer), - ('add_intercept', add_intercept_trans)]) - else: - self._featurizer = add_intercept_trans - else: - self._featurizer = self._original_featurizer - self._cov_clip = cov_clip - self._opt_reweighted = opt_reweighted - - def _effect_estimate(self, nuisances): - prel_theta, res_t, res_y, res_z, cov = [nuisance.reshape(nuisances[0].shape) for nuisance in nuisances] - - # Estimate final model of theta(X) by minimizing the square loss: - # (prel_theta(X) + (Y_res - prel_theta(X) * T_res) * Z_res / cov[T,Z | X] - theta(X))^2 - # We clip the covariance so that it is bounded away from zero, so as to reduce variance - # at the expense of some small bias. For points with very small covariance we revert - # to the model-based preliminary estimate and do not add the correction term. 
- cov_sign = np.sign(cov) - cov_sign[cov_sign == 0] = 1 - clipped_cov = cov_sign * np.clip(np.abs(cov), - self._cov_clip, np.inf) - return prel_theta + (res_y - prel_theta * res_t) * res_z / clipped_cov, clipped_cov - - def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - self.d_y = Y.shape[1:] - self.d_t = nuisances[1].shape[1:] - self.d_z = nuisances[3].shape[1:] - - # TODO: if opt_reweighted is False, we could change the logic to support multidimensional treatments, - # instruments, and outcomes - if self.d_y and self.d_y[0] > 2: - raise AttributeError("DRIV only supports a single outcome") - - if self.d_t and self.d_t[0] > 1: - if self._discrete_treatment: - raise AttributeError("DRIV only supports binary treatments") - else: - raise AttributeError("DRIV only supports single-dimensional continuous treatments") - - if self.d_z and self.d_z[0] > 1: - if self._discrete_instrument: - raise AttributeError("DRIV only supports binary instruments") - else: - raise AttributeError("DRIV only supports single-dimensional continuous instruments") - - theta_dr, clipped_cov = self._effect_estimate(nuisances) - - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.fit_transform(X) - if self._opt_reweighted and (sample_weight is not None): - sample_weight = sample_weight * clipped_cov.ravel()**2 - elif self._opt_reweighted: - sample_weight = clipped_cov.ravel()**2 - self._model_final.fit(X, theta_dr, **filter_none_kwargs(sample_weight=sample_weight, sample_var=sample_var)) - - return self - - def predict(self, X=None): - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.transform(X) - return self._model_final.predict(X).reshape((-1,) + self.d_y + self.d_t) - - def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None): - theta_dr, clipped_cov = self._effect_estimate(nuisances) - - if (X is not None) and (self._featurizer is not None): - X = self._featurizer.transform(X) - - if self._opt_reweighted and (sample_weight is not None): - sample_weight = sample_weight * clipped_cov.ravel()**2 - elif self._opt_reweighted: - sample_weight = clipped_cov.ravel()**2 - - return np.average((theta_dr.ravel() - self._model_final.predict(X).ravel())**2, - weights=sample_weight, axis=0) - - -class _BaseDRIV(_OrthoLearner): - - """ - The _BaseDRIV algorithm for estimating CATE with IVs. It is the parent of the - two public classes {DRIV, ProjectedDRIV} - - Parameters - ---------- - nuisance_models : dictionary of nuisance models, with {'name_of_model' : EstimatorObject, ...} - - model_final : estimator - model compatible with the sklearn regression API, used to fit the effect on X - - featurizer : :term:`transformer`, optional, default None - Must support fit_transform and transform. Used to create composite features in the final CATE regression. - It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). - If featurizer=None, then CATE is trained on X. - - fit_cate_intercept : bool, optional, default True - Whether the linear CATE model should have a constant term. - - cov_clip : float, optional, default 0.1 - clipping of the covariate for regions with low "overlap", to reduce variance - - opt_reweighted : bool, optional, default False - Whether to reweight the samples to minimize variance. If True then - model_final.fit must accept sample_weight as a kw argument. 
If True then - assumes the model_final is flexible enough to fit the true CATE model. Otherwise, - it method will return a biased projection to the model_final space, biased - to give more weight on parts of the feature space where the instrument is strong. - - discrete_instrument: bool, optional, default False - Whether the instrument values should be treated as categorical, rather than continuous, quantities - - discrete_treatment: bool, optional, default False - Whether the treatment values should be treated as categorical, rather than continuous, quantities - - categories: 'auto' or list, default 'auto' - The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). - The first category will be treated as the control treatment. - - cv: int, cross-validation generator or an iterable, optional, default 2 - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - :term:`CV splitter` - - An iterable yielding (train, test) splits as arrays of indices. - - For integer/None inputs, if the treatment is discrete - :class:`~sklearn.model_selection.StratifiedKFold` is used, else, - :class:`~sklearn.model_selection.KFold` is used - (with a random shuffle in either case). - - Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all - W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. - - mc_iters: int, optional (default=None) - The number of times to rerun the first stage models to reduce the variance of the nuisances. - - mc_agg: {'mean', 'median'}, optional (default='mean') - How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of - cross-fitting. - - random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; - If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used - by :mod:`np.random`. - """ - - def __init__(self, *, - model_final, - featurizer=None, - fit_cate_intercept=True, - cov_clip=0.1, - opt_reweighted=False, - discrete_instrument=False, - discrete_treatment=False, - categories='auto', - cv=2, - n_splits='raise', - mc_iters=None, - mc_agg='mean', - random_state=None): - self.model_final = clone(model_final, safe=False) - self.featurizer = clone(featurizer, safe=False) - self.fit_cate_intercept = fit_cate_intercept - self.cov_clip = cov_clip - self.opt_reweighted = opt_reweighted - super().__init__(discrete_instrument=discrete_instrument, - discrete_treatment=discrete_treatment, - categories=categories, - cv=cv, - n_splits=n_splits, - mc_iters=mc_iters, - mc_agg=mc_agg, - random_state=random_state) - - def _gen_model_final(self): - return clone(self.model_final, safe=False) - - def _gen_ortho_learner_model_final(self): - return _BaseDRIVModelFinal(self._gen_model_final(), - clone(self.featurizer, safe=False), - self.discrete_treatment, - self.discrete_instrument, - self.fit_cate_intercept, - self.cov_clip, - self.opt_reweighted) - - @_deprecate_positional("X, W, and Z should be passed by keyword only. 
In a future release " - "we will disallow passing X, W, and Z by position.", ['X', 'W', 'Z']) - def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, - cache_values=False, inference=None): - """ - Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. - - Parameters - ---------- - Y: (n, d_y) matrix or vector of length n - Outcomes for each sample - T: (n, d_t) matrix or vector of length n - Treatments for each sample - Z: (n, d_z) matrix - Instruments for each sample - X: optional(n, d_x) matrix or None (Default=None) - Features for each sample - W: optional(n, d_w) matrix or None (Default=None) - Controls for each sample - sample_weight: optional(n,) vector or None (Default=None) - Weights for each samples - sample_var: optional(n,) vector or None (Default=None) - Sample variance for each sample - groups: (n,) vector, optional - All rows corresponding to the same group will be kept together during splitting. - If groups is not None, the `cv` argument passed to this class's initializer - must support a 'groups' argument to its split method. - cache_values: bool, default False - Whether to cache inputs and first stage results, which will allow refitting a different final model - inference: string,:class:`.Inference` instance, or None - Method for performing inference. This estimator supports 'bootstrap' - (or an instance of:class:`.BootstrapInference`). - - Returns - ------- - self: _BaseDRIV instance - """ - # Replacing fit from _OrthoLearner, to reorder arguments and improve the docstring - return super().fit(Y, T, X=X, W=W, Z=Z, - sample_weight=sample_weight, sample_var=sample_var, groups=groups, - cache_values=cache_values, inference=inference) - - def score(self, Y, T, Z, X=None, W=None, sample_weight=None): - """ - Score the fitted CATE model on a new data set. Generates nuisance parameters - for the new data set based on the fitted nuisance models created at fit time. - It uses the mean prediction of the models fitted by the different crossfit folds. - Then calls the score function of the model_final and returns the calculated score. - The model_final model must have a score method. - - If model_final does not have a score method, then it raises an :exc:`.AttributeError` - - Parameters - ---------- - Y: (n, d_y) matrix or vector of length n - Outcomes for each sample - T: (n, d_t) matrix or vector of length n - Treatments for each sample - Z: (n, d_z) matrix or None (Default=None) - Instruments for each sample - X: optional (n, d_x) matrix or None (Default=None) - Features for each sample - W: optional(n, d_w) matrix or None (Default=None) - Controls for each sample - sample_weight: optional(n,) vector or None (Default=None) - Weights for each samples - - Returns - ------- - score : float or (array of float) - The score of the final CATE model on the new data. Same type as the return - type of the model_final.score method. - """ - # Replacing score from _OrthoLearner, to reorder arguments and improve the docstring - return super().score(Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight) - - @property - def original_featurizer(self): - return self.ortho_learner_model_final_._original_featurizer - - @property - def featurizer_(self): - # NOTE This is used by the inference methods and has to be the overall featurizer. 
intended - # for internal use by the library - return self.ortho_learner_model_final_._featurizer - - @property - def model_final_(self): - # NOTE This is used by the inference methods and is more for internal use to the library - return self.ortho_learner_model_final_._model_final - - def cate_feature_names(self, feature_names=None): - """ - Get the output feature names. - - Parameters - ---------- - feature_names: list of strings of length X.shape[1] or None - The names of the input features. If None and X is a dataframe, it defaults to the column names - from the dataframe. - - Returns - ------- - out_feature_names: list of strings or None - The names of the output features :math:`\\phi(X)`, i.e. the features with respect to which the - final constant marginal CATE model is linear. It is the names of the features that are associated - with each entry of the :meth:`coef_` parameter. Not available when the featurizer is not None and - does not have a method: `get_feature_names(feature_names)`. Otherwise None is returned. - """ - if feature_names is None: - feature_names = self._input_names["feature_names"] - if self.original_featurizer is None: - return feature_names - elif hasattr(self.original_featurizer, 'get_feature_names'): - return self.original_featurizer.get_feature_names(feature_names) - else: - raise AttributeError("Featurizer does not have a method: get_feature_names!") - - -class _IntentToTreatDRIVModelNuisance: - """ - Nuisance model fits the three models at fit time and at predict time - returns :math:`Y-\\E[Y|X]` and :math:`\\E[T|X,Z]-\\E[T|X]` as residuals. - """ - - def __init__(self, model_Y_X, model_T_XZ, prel_model_effect): - self._model_Y_X = clone(model_Y_X, safe=False) - self._model_T_XZ = clone(model_T_XZ, safe=False) - self._prel_model_effect = clone(prel_model_effect, safe=False) - - def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - self._model_Y_X.fit(X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) - self._model_T_XZ.fit(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight, groups=groups) - # we need to undo the one-hot encoding for calling effect, - # since it expects raw values - self._prel_model_effect.fit(Y, inverse_onehot(T), Z=inverse_onehot(Z), X=X, W=W, - sample_weight=sample_weight, groups=groups) - return self - - def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - if hasattr(self._model_Y_X, 'score'): - Y_X_score = self._model_Y_X.score(X=X, W=W, Target=Y, sample_weight=sample_weight) - else: - Y_X_score = None - if hasattr(self._model_T_XZ, 'score'): - T_XZ_score = self._model_T_XZ.score(X=X, W=W, Z=Z, Target=T, sample_weight=sample_weight) - else: - T_XZ_score = None - if hasattr(self._prel_model_effect, 'score'): - # we need to undo the one-hot encoding for calling effect, - # since it expects raw values - effect_score = self._prel_model_effect.score(Y, inverse_onehot(T), - Z=inverse_onehot(Z), X=X, W=W, sample_weight=sample_weight) - else: - effect_score = None - - return Y_X_score, T_XZ_score, effect_score - - def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): - Y_pred = self._model_Y_X.predict(X, W) - T_pred_zero = self._model_T_XZ.predict(X, W, np.zeros(Z.shape)) - T_pred_one = self._model_T_XZ.predict(X, W, np.ones(Z.shape)) - delta = (T_pred_one - T_pred_zero) / 2 - T_pred_mean = (T_pred_one + T_pred_zero) / 2 - prel_theta = self._prel_model_effect.effect(X) - if X is None: # In this case predict above returns a single row - Y_pred = np.tile(Y_pred.reshape(1, 
-1), (Y.shape[0], 1)) - prel_theta = np.tile(prel_theta.reshape(1, -1), (T.shape[0], 1)) - Y_res = Y - Y_pred.reshape(Y.shape) - T_res = T - T_pred_mean.reshape(T.shape) - return prel_theta, T_res, Y_res, 2 * Z - 1, delta - - -class _IntentToTreatDRIV(_BaseDRIV): - """ - Helper class for the DRIV algorithm for the intent-to-treat A/B test setting - """ - - def __init__(self, *, - model_Y_X, - model_T_XZ, - prel_model_effect, - model_final, - featurizer=None, - fit_cate_intercept=True, - cov_clip=.1, - cv=3, - n_splits='raise', - mc_iters=None, - mc_agg='mean', - opt_reweighted=False, - categories='auto', - random_state=None): - """ - """ - self.model_Y_X = clone(model_Y_X, safe=False) - self.model_T_XZ = clone(model_T_XZ, safe=False) - self.prel_model_effect = clone(prel_model_effect, safe=False) - # TODO: check that Y, T, Z do not have multiple columns - super().__init__(model_final=model_final, - featurizer=featurizer, - fit_cate_intercept=fit_cate_intercept, - cov_clip=cov_clip, - cv=cv, - n_splits=n_splits, - mc_iters=mc_iters, - mc_agg=mc_agg, - discrete_instrument=True, - discrete_treatment=True, - categories=categories, - opt_reweighted=opt_reweighted, - random_state=random_state) - - def _gen_prel_model_effect(self): - return clone(self.prel_model_effect, safe=False) - - def _gen_ortho_learner_model_nuisance(self): - return _IntentToTreatDRIVModelNuisance(_FirstStageWrapper(clone(self.model_Y_X, safe=False), - discrete_target=False), - _FirstStageWrapper(clone(self.model_T_XZ, safe=False), - discrete_target=True), - self._gen_prel_model_effect()) - - -class _DummyCATE: - """ - A dummy cate effect model that always returns zero effect - """ - - def __init__(self): - return - - def fit(self, y, T, *, Z, X, W=None, sample_weight=None, groups=None, **kwargs): - return self - - def effect(self, X): - if X is None: - return np.zeros(1) - return np.zeros(X.shape[0]) - - -class IntentToTreatDRIV(_IntentToTreatDRIV): - """ - Implements the DRIV algorithm for the intent-to-treat A/B test setting - - Parameters - ---------- - model_Y_X : estimator - model to estimate :math:`\\E[Y | X]`. Must support `fit` and `predict` methods. - - model_T_XZ : estimator - model to estimate :math:`\\E[T | X, Z]`. Must support `fit` and `predict_proba` methods. - - flexible_model_effect : estimator - a flexible model for a preliminary version of the CATE, must accept sample_weight at fit time. - - final_model_effect : estimator, optional - a final model for the CATE and projections. If None, then flexible_model_effect is also used as a final model - - featurizer : :term:`transformer`, optional, default None - Must support fit_transform and transform. Used to create composite features in the final CATE regression. - It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). - If featurizer=None, then CATE is trained on X. - - fit_cate_intercept : bool, optional, default True - Whether the linear CATE model should have a constant term. - - cov_clip : float, optional, default 0.1 - clipping of the covariate for regions with low "overlap", to reduce variance - - cv: int, cross-validation generator or an iterable, optional, default 3 - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - :term:`CV splitter` - - An iterable yielding (train, test) splits as arrays of indices. 
- - For integer/None inputs, if the treatment is discrete - :class:`~sklearn.model_selection.StratifiedKFold` is used, else, - :class:`~sklearn.model_selection.KFold` is used - (with a random shuffle in either case). - - Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all - W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. - - mc_iters: int, optional (default=None) - The number of times to rerun the first stage models to reduce the variance of the nuisances. - - mc_agg: {'mean', 'median'}, optional (default='mean') - How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of - cross-fitting. - - opt_reweighted : bool, optional, default False - Whether to reweight the samples to minimize variance. If True then - final_model_effect.fit must accept sample_weight as a kw argument (WeightWrapper from - utilities can be used for any linear model to enable sample_weights). If True then - assumes the final_model_effect is flexible enough to fit the true CATE model. Otherwise, - it method will return a biased projection to the model_effect space, biased - to give more weight on parts of the feature space where the instrument is strong. - - categories: 'auto' or list, default 'auto' - The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). - The first category will be treated as the control treatment. - - random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; - If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used - by :mod:`np.random`. 
- """ - - def __init__(self, *, - model_Y_X, - model_T_XZ, - flexible_model_effect, - model_final=None, - featurizer=None, - fit_cate_intercept=True, - cov_clip=.1, - cv=3, - n_splits='raise', - mc_iters=None, - mc_agg='mean', - opt_reweighted=False, - categories='auto', - random_state=None): - self.flexible_model_effect = clone(flexible_model_effect, safe=False) - super().__init__(model_Y_X=model_Y_X, - model_T_XZ=model_T_XZ, - prel_model_effect=None, - model_final=model_final, - featurizer=featurizer, - fit_cate_intercept=fit_cate_intercept, - cov_clip=cov_clip, - cv=cv, - n_splits=n_splits, - mc_iters=mc_iters, - mc_agg=mc_agg, - opt_reweighted=opt_reweighted, - categories=categories, - random_state=random_state) - - def _gen_model_final(self): - if self.model_final is None: - return clone(self.flexible_model_effect, safe=False) - return clone(self.model_final, safe=False) - - def _gen_prel_model_effect(self): - return _IntentToTreatDRIV(model_Y_X=clone(self.model_Y_X, safe=False), - model_T_XZ=clone(self.model_T_XZ, safe=False), - prel_model_effect=_DummyCATE(), - model_final=clone(self.flexible_model_effect, safe=False), - cov_clip=1e-7, - cv=1, - opt_reweighted=True, - random_state=self.random_state) - - @property - def models_Y_X(self): - return [mdl._model_Y_X._model for mdl in super().models_nuisance_] - - @property - def models_T_XZ(self): - return [mdl._model_T_XZ._model for mdl in super().models_nuisance_] - - @property - def nuisance_scores_Y_X(self): - return self.nuisance_scores_[0] - - @property - def nuisance_scores_T_XZ(self): - return self.nuisance_scores_[1] - - @property - def nuisance_scores_effect(self): - return self.nuisance_scores_[2] - - @property - def prel_model_effect(self): - return self._gen_prel_model_effect() - - @prel_model_effect.setter - def prel_model_effect(self, value): - if value is not None: - raise ValueError("Parameter `prel_model_effect` cannot be altered for this estimator.") - - -class LinearIntentToTreatDRIV(StatsModelsCateEstimatorMixin, IntentToTreatDRIV): - """ - Implements the DRIV algorithm for the intent-to-treat A/B test setting - - Parameters - ---------- - model_Y_X : estimator - model to estimate :math:`\\E[Y | X]`. Must support `fit` and `predict` methods. - - model_T_XZ : estimator - model to estimate :math:`\\E[T | X, Z]`. Must support `fit` and `predict_proba` methods. - - flexible_model_effect : estimator - a flexible model for a preliminary version of the CATE, must accept sample_weight at fit time. - - featurizer : :term:`transformer`, optional, default None - Must support fit_transform and transform. Used to create composite features in the final CATE regression. - It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X). - If featurizer=None, then CATE is trained on X. - - fit_cate_intercept : bool, optional, default True - Whether the linear CATE model should have a constant term. - - cov_clip : float, optional, default 0.1 - clipping of the covariate for regions with low "overlap", to reduce variance - - cv: int, cross-validation generator or an iterable, optional, default 3 - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - :term:`CV splitter` - - An iterable yielding (train, test) splits as arrays of indices. 
- - For integer/None inputs, if the treatment is discrete - :class:`~sklearn.model_selection.StratifiedKFold` is used, else, - :class:`~sklearn.model_selection.KFold` is used - (with a random shuffle in either case). - - Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all - W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. - - mc_iters: int, optional (default=None) - The number of times to rerun the first stage models to reduce the variance of the nuisances. - - mc_agg: {'mean', 'median'}, optional (default='mean') - How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of - cross-fitting. - - categories: 'auto' or list, default 'auto' - The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). - The first category will be treated as the control treatment. - - random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; - If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used - by :mod:`np.random`. - """ - - def __init__(self, *, - model_Y_X, - model_T_XZ, - flexible_model_effect, - featurizer=None, - fit_cate_intercept=True, - cov_clip=.1, - cv=3, - n_splits='raise', - mc_iters=None, - mc_agg='mean', - categories='auto', - random_state=None): - super().__init__(model_Y_X=model_Y_X, - model_T_XZ=model_T_XZ, - flexible_model_effect=flexible_model_effect, - featurizer=featurizer, - fit_cate_intercept=fit_cate_intercept, - model_final=None, - cov_clip=cov_clip, - cv=cv, - n_splits=n_splits, - mc_iters=mc_iters, - mc_agg=mc_agg, - opt_reweighted=False, - categories=categories, random_state=random_state) - - def _gen_model_final(self): - return StatsModelsLinearRegression(fit_intercept=False) - - # override only so that we can update the docstring to indicate support for `StatsModelsInference` - @_deprecate_positional("X, W, and Z should be passed by keyword only. In a future release " - "we will disallow passing X, W, and Z by position.", ['X', 'W', 'Z']) - def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, - cache_values=False, inference='auto'): - """ - Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. - - Parameters - ---------- - Y: (n, d_y) matrix or vector of length n - Outcomes for each sample - T: (n, d_t) matrix or vector of length n - Treatments for each sample - Z: (n, d_z) matrix or vector of length n - Instruments for each sample - X: optional(n, d_x) matrix or None (Default=None) - Features for each sample - W: optional(n, d_w) matrix or None (Default=None) - Controls for each sample - sample_weight: optional(n,) vector or None (Default=None) - Weights for each samples - sample_var: optional(n,) vector or None (Default=None) - Sample variance for each sample - groups: (n,) vector, optional - All rows corresponding to the same group will be kept together during splitting. - If groups is not None, the `cv` argument passed to this class's initializer - must support a 'groups' argument to its split method. 
- cache_values: bool, default False - Whether to cache inputs and first stage results, which will allow refitting a different final model - inference: string,:class:`.Inference` instance, or None - Method for performing inference. This estimator supports 'bootstrap' - (or an instance of:class:`.BootstrapInference`) and 'statsmodels' - (or an instance of :class:`.StatsModelsInference`). - - Returns - ------- - self : instance - """ - return super().fit(Y, T, Z=Z, X=X, W=W, - sample_weight=sample_weight, sample_var=sample_var, groups=groups, - cache_values=cache_values, inference=inference) - - def refit_final(self, *, inference='auto'): - return super().refit_final(inference=inference) - refit_final.__doc__ = _OrthoLearner.refit_final.__doc__ - - @property - def bias_part_of_coef(self): - return self.ortho_learner_model_final_._fit_cate_intercept - - @property - def fit_cate_intercept_(self): - return self.ortho_learner_model_final_._fit_cate_intercept - - @property - def model_final(self): - return self._gen_model_final() - - @model_final.setter - def model_final(self, value): - if value is not None: - raise ValueError("Parameter `model_final` cannot be altered for this estimator.") - - @property - def opt_reweighted(self): - return False - - @opt_reweighted.setter - def opt_reweighted(self, value): - if not (value is False): - raise ValueError("Parameter `value` cannot be altered from `False` for this estimator.") +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import econml.iv.dml as dmliv +import econml.iv.dr as driv +from .utilities import deprecated + + +@deprecated("The econml.ortho_iv.DMLATEIV class has been moved to econml.iv.dml.DMLATEIV; " + "an upcoming release will remove support for the old name") +class DMLATEIV(dmliv.DMLATEIV): + pass + + +@deprecated("The econml.ortho_iv.ProjectedDMLATEIV class has been moved to econml.iv.dml.ProjectedDMLATEIV; " + "an upcoming release will remove support for the old name") +class ProjectedDMLATEIV(dmliv.ProjectedDMLATEIV): + pass + + +@deprecated("The econml.ortho_iv.DMLIV class has been moved to econml.iv.dml.DMLIV; " + "an upcoming release will remove support for the old name") +class DMLIV(dmliv.DMLIV): + pass + + +@deprecated("The econml.ortho_iv.NonParamDMLIV class has been moved to econml.iv.dml.NonParamDMLIV; " + "an upcoming release will remove support for the old name") +class NonParamDMLIV(dmliv.NonParamDMLIV): + pass + + +@deprecated("The econml.ortho_iv.IntentToTreatDRIV class has been moved to econml.iv.dr.IntentToTreatDRIV; " + "an upcoming release will remove support for the old name") +class IntentToTreatDRIV(driv.IntentToTreatDRIV): + pass + + +@deprecated("The econml.ortho_iv.LinearIntentToTreatDRIV class has been moved " + "to econml.iv.dr.LinearIntentToTreatDRIV; " + "an upcoming release will remove support for the old name") +class LinearIntentToTreatDRIV(driv.LinearIntentToTreatDRIV): + pass diff --git a/econml/sklearn_extensions/linear_model.py b/econml/sklearn_extensions/linear_model.py index 6fe6311f..de63bbf1 100644 --- a/econml/sklearn_extensions/linear_model.py +++ b/econml/sklearn_extensions/linear_model.py @@ -3,8 +3,7 @@ """Collection of scikit-learn extensions for linear models. -.. testcode:: - :hide: +.. 
testsetup:: # Our classes that derive from sklearn ones sometimes include # inherited docstrings that have embedded doctests; we need the following imports @@ -199,6 +198,7 @@ class WeightedLasso(WeightedModelMixin, Lasso): n_iter_ : int | array-like, shape (n_targets,) number of iterations run by the coordinate descent solver to reach the specified tolerance. + """ def __init__(self, alpha=1.0, fit_intercept=True, @@ -294,6 +294,7 @@ class WeightedMultiTaskLasso(WeightedModelMixin, MultiTaskLasso): n_iter_ : int | array-like, shape (n_targets,) number of iterations run by the coordinate descent solver to reach the specified tolerance. + """ def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, @@ -326,6 +327,11 @@ class WeightedMultiTaskLasso(WeightedModelMixin, MultiTaskLasso): class WeightedLassoCV(WeightedModelMixin, LassoCV): """Version of sklearn LassoCV that accepts weights. + .. testsetup:: + + import numpy as np + from sklearn.linear_model import lasso_path + Parameters ---------- eps : float, optional @@ -397,6 +403,7 @@ class WeightedLassoCV(WeightedModelMixin, LassoCV): rather than looping over features sequentially by default. This (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. + """ def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, @@ -438,6 +445,11 @@ class WeightedLassoCV(WeightedModelMixin, LassoCV): class WeightedMultiTaskLassoCV(WeightedModelMixin, MultiTaskLassoCV): """Version of sklearn MultiTaskLassoCV that accepts weights. + .. testsetup:: + + import numpy as np + from sklearn.linear_model import lasso_path + Parameters ---------- eps : float, optional @@ -502,6 +514,7 @@ class WeightedMultiTaskLassoCV(WeightedModelMixin, MultiTaskLassoCV): rather than looping over features sequentially by default. This (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. + """ def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, @@ -575,6 +588,11 @@ class DebiasedLasso(WeightedLasso): Only implemented for single-dimensional output. + .. testsetup:: + + import numpy as np + from sklearn.linear_model import lasso_path + Parameters ---------- alpha : string | float, optional, default 'auto'. diff --git a/econml/tests/test_automated_ml.py b/econml/tests/test_automated_ml.py index 16ab5c3e..3c3f65f2 100644 --- a/econml/tests/test_automated_ml.py +++ b/econml/tests/test_automated_ml.py @@ -11,7 +11,7 @@ from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, \ from sklearn.model_selection import KFold from econml.dml import * from econml.metalearners import * -from econml.drlearner import DRLearner +from econml.dr import DRLearner import numpy as np from econml.utilities import shape, hstack, vstack, reshape, \ cross_product diff --git a/econml/tests/test_bootstrap.py b/econml/tests/test_bootstrap.py index 81c33fd1..c410ebc6 100644 --- a/econml/tests/test_bootstrap.py +++ b/econml/tests/test_bootstrap.py @@ -1,11 +1,11 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-from econml.bootstrap import BootstrapEstimator +from econml.inference._bootstrap import BootstrapEstimator from econml.inference import BootstrapInference from econml.dml import LinearDML -from econml.ortho_iv import LinearIntentToTreatDRIV -from econml.two_stage_least_squares import NonparametricTwoStageLeastSquares +from econml.iv.dr import LinearIntentToTreatDRIV +from econml.iv.sieve import SieveTSLS from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.preprocessing import PolynomialFeatures import numpy as np @@ -235,10 +235,10 @@ class TestBootstrap(unittest.TestCase): opts = BootstrapInference(50, 2) - est = NonparametricTwoStageLeastSquares(t_featurizer=PolynomialFeatures(2), - x_featurizer=PolynomialFeatures(2), - z_featurizer=PolynomialFeatures(2), - dt_featurizer=None) + est = SieveTSLS(t_featurizer=PolynomialFeatures(2), + x_featurizer=PolynomialFeatures(2), + z_featurizer=PolynomialFeatures(2), + dt_featurizer=None) est.fit(y, t, X=x, W=None, Z=z, inference=opts) # test that we can get an interval for the same attribute for the bootstrap as the original, @@ -274,7 +274,7 @@ class TestBootstrap(unittest.TestCase): Y = [1, 2, 3, 4, 5, 6] X = np.array([1, 1, 2, 2, 1, 2]).reshape(-1, 1) est = LinearDML(model_y=LinearRegression(), model_t=LogisticRegression(), discrete_treatment=True) - inference = BootstrapInference(n_bootstrap_samples=5) + inference = BootstrapInference(n_bootstrap_samples=5, n_jobs=-1, verbose=0) est.fit(Y, T, inference=inference) est.const_marginal_effect_interval() @@ -292,7 +292,7 @@ class TestBootstrap(unittest.TestCase): X = np.array([1, 1, 2, 2, 1, 2, 1, 2]).reshape(-1, 1) est = LinearIntentToTreatDRIV(model_Y_X=LinearRegression(), model_T_XZ=LogisticRegression(), flexible_model_effect=LinearRegression(), cv=2) - inference = BootstrapInference(n_bootstrap_samples=20) + inference = BootstrapInference(n_bootstrap_samples=20, n_jobs=-1, verbose=3) est.fit(Y, T, Z=Z, X=X, inference=inference) est.const_marginal_effect_interval(X) @@ -303,7 +303,7 @@ class TestBootstrap(unittest.TestCase): est = LinearDML(cv=2) for kind in ['percentile', 'pivot', 'normal']: with self.subTest(kind=kind): - inference = BootstrapInference(n_bootstrap_samples=5, bootstrap_type=kind) + inference = BootstrapInference(n_bootstrap_samples=5, n_jobs=-1, verbose=0, bootstrap_type=kind) est.fit(Y, T, inference=inference) i = est.const_marginal_effect_interval() inf = est.const_marginal_effect_inference() diff --git a/econml/tests/test_deepiv.py b/econml/tests/test_deepiv.py index 968ead6a..4bc55e27 100644 --- a/econml/tests/test_deepiv.py +++ b/econml/tests/test_deepiv.py @@ -13,8 +13,9 @@ import keras.backend as K import pytest -from econml.deepiv import _zero_grad -from econml.deepiv import mog_model, mog_loss_model, mog_sample_model, response_loss_model, DeepIV +from econml.iv.nnet._deepiv import _zero_grad +from econml.iv.nnet import DeepIV +from econml.iv.nnet._deepiv import mog_model, mog_loss_model, mog_sample_model, response_loss_model from econml.utilities import reshape diff --git a/econml/tests/test_drlearner.py b/econml/tests/test_drlearner.py index e8b6a13d..7b56cdfd 100644 --- a/econml/tests/test_drlearner.py +++ b/econml/tests/test_drlearner.py @@ -13,7 +13,7 @@ from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder, FunctionTransformer from sklearn.model_selection import KFold, GroupKFold from sklearn.preprocessing import PolynomialFeatures -from econml.drlearner import DRLearner, LinearDRLearner, 
SparseLinearDRLearner, ForestDRLearner +from econml.dr import DRLearner, LinearDRLearner, SparseLinearDRLearner, ForestDRLearner from econml.utilities import shape, hstack, vstack, reshape, cross_product from econml.inference import BootstrapInference, StatsModelsInferenceDiscrete from contextlib import ExitStack diff --git a/econml/tests/test_inference.py b/econml/tests/test_inference.py index cc4f6e93..73ef5901 100644 --- a/econml/tests/test_inference.py +++ b/econml/tests/test_inference.py @@ -8,7 +8,7 @@ from sklearn.base import clone from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso from econml.dml import LinearDML, DML, NonParamDML -from econml.drlearner import LinearDRLearner, DRLearner +from econml.dr import LinearDRLearner, DRLearner from econml.inference import (BootstrapInference, NormalInferenceResults, EmpiricalInferenceResults, PopulationSummaryResults) from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression, DebiasedLasso diff --git a/econml/tests/test_montecarlo.py b/econml/tests/test_montecarlo.py index 8750eec2..323f4bd4 100644 --- a/econml/tests/test_montecarlo.py +++ b/econml/tests/test_montecarlo.py @@ -6,9 +6,9 @@ import unittest from sklearn.linear_model import LinearRegression, LogisticRegression from econml.dml import (DML, LinearDML, SparseLinearDML, KernelDML, NonParamDML, ForestDML) -from econml.drlearner import (DRLearner, LinearDRLearner, SparseLinearDRLearner, ForestDRLearner) -from econml.ortho_iv import (DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV, - IntentToTreatDRIV, LinearIntentToTreatDRIV) +from econml.dr import (DRLearner, LinearDRLearner, SparseLinearDRLearner, ForestDRLearner) +from econml.iv.dml import (DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV) +from econml.iv.dr import (IntentToTreatDRIV, LinearIntentToTreatDRIV) import numpy as np diff --git a/econml/tests/test_orf.py b/econml/tests/test_orf.py index 9f4aa872..df9b4987 100644 --- a/econml/tests/test_orf.py +++ b/econml/tests/test_orf.py @@ -10,7 +10,7 @@ from sklearn.exceptions import DataConversionWarning from sklearn.linear_model import LinearRegression, Lasso, LassoCV, LogisticRegression, LogisticRegressionCV from sklearn.multioutput import MultiOutputRegressor from sklearn.pipeline import Pipeline -from econml.ortho_forest import DMLOrthoForest, DROrthoForest +from econml.orf import DMLOrthoForest, DROrthoForest from econml.sklearn_extensions.linear_model import WeightedLassoCVWrapper diff --git a/econml/tests/test_ortho_iv.py b/econml/tests/test_ortho_iv.py index 280b6e9e..ed95a1b3 100644 --- a/econml/tests/test_ortho_iv.py +++ b/econml/tests/test_ortho_iv.py @@ -8,8 +8,8 @@ from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, PolynomialFeatures from sklearn.model_selection import KFold -from econml.ortho_iv import (DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV, - IntentToTreatDRIV, LinearIntentToTreatDRIV) +from econml.iv.dml import (DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV) +from econml.iv.dr import (IntentToTreatDRIV, LinearIntentToTreatDRIV) import numpy as np from econml.utilities import shape, hstack, vstack, reshape, cross_product from econml.inference import BootstrapInference diff --git a/econml/tests/test_random_state.py b/econml/tests/test_random_state.py index f90f1941..a9ccdd98 100644 --- 
a/econml/tests/test_random_state.py +++ b/econml/tests/test_random_state.py @@ -7,9 +7,9 @@ from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, Polynomial from sklearn.model_selection import KFold, GroupKFold from econml.dml import DML, LinearDML, SparseLinearDML, KernelDML from econml.dml import NonParamDML, CausalForestDML -from econml.drlearner import DRLearner, SparseLinearDRLearner, LinearDRLearner, ForestDRLearner -from econml.ortho_iv import DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV,\ - IntentToTreatDRIV, LinearIntentToTreatDRIV +from econml.dr import DRLearner, SparseLinearDRLearner, LinearDRLearner, ForestDRLearner +from econml.iv.dml import (DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV) +from econml.iv.dr import (IntentToTreatDRIV, LinearIntentToTreatDRIV) import numpy as np from econml.utilities import shape, hstack, vstack, reshape, cross_product from econml.inference import BootstrapInference diff --git a/econml/tests/test_refit.py b/econml/tests/test_refit.py index 1a8e6220..5e0526b0 100644 --- a/econml/tests/test_refit.py +++ b/econml/tests/test_refit.py @@ -6,9 +6,9 @@ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from sklearn.preprocessing import PolynomialFeatures from sklearn.pipeline import Pipeline from econml.dml import (DML, LinearDML, SparseLinearDML, KernelDML, NonParamDML, ForestDML) -from econml.drlearner import (DRLearner, LinearDRLearner, SparseLinearDRLearner, ForestDRLearner) -from econml.ortho_iv import (DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV, - IntentToTreatDRIV, LinearIntentToTreatDRIV) +from econml.dr import (DRLearner, LinearDRLearner, SparseLinearDRLearner, ForestDRLearner) +from econml.iv.dml import (DMLATEIV, ProjectedDMLATEIV, DMLIV, NonParamDMLIV) +from econml.iv.dr import (IntentToTreatDRIV, LinearIntentToTreatDRIV) from econml.sklearn_extensions.linear_model import (DebiasedLasso, WeightedLasso, StatsModelsRLM, StatsModelsLinearRegression) from econml.inference import NormalInferenceResults, BootstrapInference diff --git a/econml/tests/test_rscorer.py b/econml/tests/test_rscorer.py index 6dc6f255..93f4779a 100644 --- a/econml/tests/test_rscorer.py +++ b/econml/tests/test_rscorer.py @@ -11,7 +11,7 @@ from joblib import Parallel, delayed from econml.dml import DML, LinearDML, SparseLinearDML, NonParamDML from econml.metalearners import XLearner, TLearner, SLearner, DomainAdaptationLearner -from econml.drlearner import DRLearner +from econml.dr import DRLearner from econml.score import RScorer diff --git a/econml/tests/test_shap.py b/econml/tests/test_shap.py index 7eeffa3e..13f1e619 100644 --- a/econml/tests/test_shap.py +++ b/econml/tests/test_shap.py @@ -6,10 +6,10 @@ import unittest import shap from shap.plots import scatter, heatmap, bar, beeswarm, waterfall, force from econml.dml import * -from econml.ortho_forest import * -from econml.drlearner import * +from econml.orf import * +from econml.dr import * from econml.metalearners import * -from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from sklearn.preprocessing import PolynomialFeatures diff --git a/econml/tests/test_two_stage_least_squares.py b/econml/tests/test_two_stage_least_squares.py index b2002f38..dab526c4 100644 --- a/econml/tests/test_two_stage_least_squares.py +++ b/econml/tests/test_two_stage_least_squares.py @@ -9,7 +9,7 @@ import warnings 
import pytest from econml.utilities import shape, reshape -from econml.two_stage_least_squares import (NonparametricTwoStageLeastSquares, HermiteFeatures, DPolynomialFeatures) +from econml.iv.sieve import (SieveTSLS, HermiteFeatures, DPolynomialFeatures) from sklearn.linear_model import LinearRegression from sklearn.preprocessing import PolynomialFeatures @@ -73,7 +73,7 @@ class Test2SLS(unittest.TestCase): d_w = 1 if d_z >= n_t: T, Y, X, Z, W = [make_random(d) for d in [d_t, d_y, d_x, d_z, d_w]] - est = NonparametricTwoStageLeastSquares( + est = SieveTSLS( t_featurizer=PolynomialFeatures(), x_featurizer=PolynomialFeatures(), z_featurizer=PolynomialFeatures(), @@ -100,7 +100,7 @@ class Test2SLS(unittest.TestCase): T = np.hstack([np.cross(X, Z).reshape(-1, 1) + W, (np.prod(X, axis=1) + np.prod(Z, axis=1)).reshape(-1, 1)]) Y = X * T + X**2 - est = NonparametricTwoStageLeastSquares( + est = SieveTSLS( t_featurizer=PolynomialFeatures(degree=2, interaction_only=False, include_bias=True), x_featurizer=PolynomialFeatures(degree=2, interaction_only=False, include_bias=True), z_featurizer=PolynomialFeatures(degree=2, interaction_only=False, include_bias=True), @@ -149,10 +149,10 @@ class Test2SLS(unittest.TestCase): p_fresh = x_fresh + z_fresh * e_fresh + np.random.uniform(size=(n, d_t)) for (dt, dx, dz) in [(0, 0, 0), (1, 1, 1), (5, 5, 5), (10, 10, 10), (3, 3, 10), (10, 10, 3)]: - np2sls = NonparametricTwoStageLeastSquares(t_featurizer=HermiteFeatures(dt), - x_featurizer=HermiteFeatures(dx), - z_featurizer=HermiteFeatures(dz), - dt_featurizer=HermiteFeatures(dt, shift=1)) + np2sls = SieveTSLS(t_featurizer=HermiteFeatures(dt), + x_featurizer=HermiteFeatures(dx), + z_featurizer=HermiteFeatures(dz), + dt_featurizer=HermiteFeatures(dt, shift=1)) np2sls.fit(y, p, X=x, W=w, Z=z) effect = np2sls.effect(x_fresh, np.zeros(shape(p_fresh)), p_fresh) losses.append(np.mean(np.square(p_fresh * x_fresh - effect))) diff --git a/econml/two_stage_least_squares.py b/econml/two_stage_least_squares.py index 8d260b54..b28ae7a5 100644 --- a/econml/two_stage_least_squares.py +++ b/econml/two_stage_least_squares.py @@ -1,362 +1,26 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - -"""Provides a non-parametric two-stage least squares instrumental variable estimator.""" - -import numpy as np -from copy import deepcopy -from sklearn import clone -from sklearn.linear_model import LinearRegression -from .utilities import shape, transpose, reshape, cross_product, ndim, size,\ - _deprecate_positional, check_input_arrays -from ._cate_estimator import BaseCateEstimator, LinearCateEstimator -from numpy.polynomial.hermite_e import hermeval -from sklearn.base import TransformerMixin -from sklearn.preprocessing import PolynomialFeatures -from itertools import product - - -class HermiteFeatures(TransformerMixin): - """ - Featurizer that returns(unscaled) Hermite function evaluations. - - The evaluated functions are of degrees 0..`degree`, differentiated `shift` times. - - If the input has shape(n, x) and `joint` is False, the output will have shape(n, (`degree`+ 1)×x) if `shift` is 0. - If the input has shape(n, x) and `joint` is True, the output will have shape(n, (`degree`+ 1) ^ x) if `shift` is 0. - In either case, if `shift` is nonzero there will be `shift` additional dimensions of size x - between the first and last. 
- """ - - def __init__(self, degree, shift=0, joint=False): - self._degree = degree - self._shift = shift - self._joint = joint - - def _column_feats(self, X, shift): - """ - Apply Hermite function evaluations of degrees 0..`degree` differentiated `shift` times. - - When applied to the column `X` of shape(n,), the resulting array has shape(n, (degree + 1)). - """ - assert ndim(X) == 1 - # this will have dimension (d,) + shape(X) - coeffs = np.identity(self._degree + shift + 1)[:, shift:] - feats = ((-1) ** shift) * hermeval(X, coeffs) * np.exp(-X * X / 2) - # send the first dimension to the end - return transpose(feats) - - def fit(self, X): - """Fits the data(a NOP for this class) and returns self.""" - return self - - def transform(self, X): - """ - Transform the data by applying the appropriate Hermite functions. - - Parameters - ---------- - X: array_like - 2-dimensional array of input features - - Returns - ------- - The transformed data - """ - assert ndim(X) == 2 - n = shape(X)[0] - ncols = shape(X)[1] - columns = [] - for indices in product(*[range(ncols) for i in range(self._shift)]): - if self._joint: - columns.append(cross_product(*[self._column_feats(X[:, i], indices.count(i)) - for i in range(shape(X)[1])])) - else: - indices = set(indices) - if self._shift == 0: # return features for all columns: - columns.append(np.hstack([self._column_feats(X[:, i], self._shift) for i in range(shape(X)[1])])) - # columns are featurized independently; partial derivatives are only non-zero - # when taken with respect to the same column each time - elif len(indices) == 1: - index = list(indices)[0] - feats = self._column_feats(X[:, index], self._shift) - columns.append(np.hstack([feats if i == index else np.zeros(shape(feats)) - for i in range(shape(X)[1])])) - else: - columns.append(np.zeros((n, (self._degree + 1) * ncols))) - return reshape(np.hstack(columns), (n,) + (ncols,) * self._shift + (-1,)) - - -class DPolynomialFeatures(TransformerMixin): - """ - Featurizer that returns the derivatives of :class:`~sklearn.preprocessing.PolynomialFeatures` features in - a way that's compativle with the expectations of :class:`.NonparametricTwoStageLeastSquares`'s - `dt_featurizer` parameter. - - If the input has shape `(n, x)` and - :meth:`PolynomialFeatures.transform` returns an output - of shape `(n, f)`, then :meth:`.transform` will return an array of shape `(n, x, f)`. - - Parameters - ---------- - degree: integer, default = 2 - The degree of the polynomial features. - - interaction_only: boolean, default = False - If true, only derivatives of interaction features are produced: features that are products of at most degree - distinct input features (so not `x[1] ** 2`, `x[0] * x[2] ** 3`, etc.). - - include_bias: boolean, default = True - If True (default), then include the derivative of a bias column, the feature in which all polynomial powers - are zero. - """ - - def __init__(self, degree=2, interaction_only=False, include_bias=True): - self.F = PolynomialFeatures(degree=degree, interaction_only=interaction_only, include_bias=include_bias) - - def fit(self, X, y=None): - """ - Compute number of output features. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - The data. - y : array, optional - Not used - - Returns - ------- - self : instance - """ - return self - - def transform(self, X): - """ - Transform data to derivatives of polynomial features - - Parameters - ---------- - X: array-like, shape (n_samples, n_features) - The data to transform, row by row. 
- - Returns - ------- - XP: array-like, shape (n_samples, n_features, n_output_features) - The matrix of features, where `n_output_features` is the number of features that - would be returned from :class:`~sklearn.preprocessing.PolynomialFeatures`. - """ - self.F.fit(X) - powers = self.F.powers_ - result = np.zeros(X.shape + (self.F.n_output_features_,)) - for i in range(X.shape[1]): - p = powers.copy() - c = powers[:, i] - p[:, i] -= 1 - M = np.float_power(X[:, np.newaxis, :], p[np.newaxis, :, :]) - result[:, i, :] = c[np.newaxis, :] * np.prod(M, axis=-1) - return result - - -def _add_ones(arr): - """Add a column of ones to the front of an array.""" - return np.hstack([np.ones((shape(arr)[0], 1)), arr]) - - -def _add_zeros(arr): - """Add a column of zeros to the front of an array.""" - return np.hstack([np.zeros((shape(arr)[0], 1)), arr]) - - -class NonparametricTwoStageLeastSquares(BaseCateEstimator): - """ - Non-parametric instrumental variables estimator. - - Supports the use of arbitrary featurizers for the features, treatments, and instruments. - - Parameters - ---------- - t_featurizer: transformer - Featurizer used to transform the treatments - - x_featurizer: transformer - Featurizer used to transform the raw features - - z_featurizer: transformer - Featurizer used to transform the instruments - - dt_featurizer: transformer - Featurizer used to transform the treatments for the computation of the marginal effect. - This should produce a 3-dimensional array, containing the per-treatment derivative of - each transformed treatment. That is, given a treatment array of shape(n, dₜ), - the output should have shape(n, dₜ, fₜ), where fₜ is the number of columns produced by `t_featurizer`. - - """ - - def __init__(self, *, - t_featurizer, - x_featurizer, - z_featurizer, - dt_featurizer): - self._t_featurizer = clone(t_featurizer, safe=False) - self._x_featurizer = clone(x_featurizer, safe=False) - self._z_featurizer = clone(z_featurizer, safe=False) - self._dt_featurizer = clone(dt_featurizer, safe=False) - # don't fit intercept; manually add column of ones to the data instead; - # this allows us to ignore the intercept when computing marginal effects - self._model_T = LinearRegression(fit_intercept=False) - self._model_Y = LinearRegression(fit_intercept=False) - super().__init__() - - @_deprecate_positional("X, W, and Z should be passed by keyword only. In a future release " - "we will disallow passing X, W, and Z by position.", ['X', 'W', 'Z']) - @BaseCateEstimator._wrap_fit - def fit(self, Y, T, X, W, Z, *, inference=None): - """ - Estimate the counterfactual model from data, i.e. estimates functions τ(·, ·, ·), ∂τ(·, ·). - - Parameters - ---------- - Y: (n × d_y) matrix - Outcomes for each sample - T: (n × dₜ) matrix - Treatments for each sample - X: optional(n × dₓ) matrix - Features for each sample - W: optional(n × d_w) matrix - Controls for each sample - Z: optional(n × d_z) matrix - Instruments for each sample - inference: string, :class:`.Inference` instance, or None - Method for performing inference. 
This estimator supports 'bootstrap' - (or an instance of :class:`.BootstrapInference`) - - Returns - ------- - self - - """ - Y, T, X, W, Z = check_input_arrays(Y, T, X, W, Z) - if X is None: - X = np.empty((shape(Y)[0], 0)) - if W is None: - W = np.empty((shape(Y)[0], 0)) - assert shape(Y)[0] == shape(T)[0] == shape(X)[0] == shape(W)[0] == shape(Z)[0] - - # make T 2D if if was a vector - if ndim(T) == 1: - T = reshape(T, (-1, 1)) - - # store number of columns of W so that we can create correctly shaped zero array in effect and marginal effect - self._d_w = shape(W)[1] - - # two stage approximation - # first, get basis expansions of T, X, and Z - ft_X = self._x_featurizer.fit_transform(X) - ft_Z = self._z_featurizer.fit_transform(Z) - ft_T = self._t_featurizer.fit_transform(T) - # TODO: is it right that the effective number of intruments is the - # product of ft_X and ft_Z, not just ft_Z? - assert shape(ft_T)[1] <= shape(ft_X)[1] * shape(ft_Z)[1], ("There can be no more T features than the product " - "of the number of X and Z features; otherwise " - "there is not enough information to identify their " - "structure") - - # regress T expansion on X,Z expansions concatenated with W - features = _add_ones(np.hstack([W, cross_product(ft_X, ft_Z)])) - self._model_T.fit(features, ft_T) - # predict ft_T from interacted ft_X, ft_Z - ft_T_hat = self._model_T.predict(features) - self._model_Y.fit(_add_ones(np.hstack([W, cross_product(ft_T_hat, ft_X)])), Y) - - def effect(self, X=None, T0=0, T1=1): - """ - Calculate the heterogeneous treatment effect τ(·,·,·). - - The effect is calculated between the two treatment points - conditional on a vector of features on a set of m test samples {T0ᵢ, T1ᵢ, Xᵢ}. - - Parameters - ---------- - T0: (m × dₜ) matrix or vector of length m - Base treatments for each sample - T1: (m × dₜ) matrix or vector of length m - Target treatments for each sample - X: optional (m × dₓ) matrix - Features for each sample - - Returns - ------- - τ: (m × d_y) matrix - Heterogeneous treatment effects on each outcome for each sample - Note that when Y is a vector rather than a 2-dimensional array, the corresponding - singleton dimension will be collapsed (so this method will return a vector) - - """ - if ndim(T0) == 0: - T0 = np.full((1 if X is None else shape(X)[0],) + self._d_t, T0) - if ndim(T1) == 0: - T1 = np.full((1 if X is None else shape(X)[0],) + self._d_t, T1) - if ndim(T0) == 1: - T0 = reshape(T0, (-1, 1)) - if ndim(T1) == 1: - T1 = reshape(T1, (-1, 1)) - if X is None: - X = np.empty((shape(T0)[0], 0)) - assert shape(T0) == shape(T1) - assert shape(T0)[0] == shape(X)[0] - - W = np.zeros((shape(T0)[0], self._d_w)) # can set arbitrarily since values will cancel - ft_X = self._x_featurizer.transform(X) - ft_T0 = self._t_featurizer.transform(T0) - ft_T1 = self._t_featurizer.transform(T1) - Y0 = self._model_Y.predict(_add_ones(np.hstack([W, cross_product(ft_T0, ft_X)]))) - Y1 = self._model_Y.predict(_add_ones(np.hstack([W, cross_product(ft_T1, ft_X)]))) - return Y1 - Y0 - - def marginal_effect(self, T, X=None): - """ - Calculate the heterogeneous marginal effect ∂τ(·, ·). - - The marginal effect is calculated around a base treatment - point conditional on a vector of features on a set of m test samples {Tᵢ, Xᵢ}. 
- - Parameters - ---------- - T: (m × dₜ) matrix - Base treatments for each sample - X: optional(m × dₓ) matrix - Features for each sample - - Returns - ------- - grad_tau: (m × d_y × dₜ) array - Heterogeneous marginal effects on each outcome for each sample - Note that when Y or T is a vector rather than a 2-dimensional array, - the corresponding singleton dimensions in the output will be collapsed - (e.g. if both are vectors, then the output of this method will also be a vector) - """ - if X is None: - X = np.empty((shape(T)[0], 0)) - assert shape(T)[0] == shape(X)[0] - - ft_X = self._x_featurizer.transform(X) - n = shape(T)[0] - dT = self._dt_featurizer.transform(T if ndim(T) == 2 else reshape(T, (-1, 1))) - W = np.zeros((size(T), self._d_w)) - # dT should be an n×dₜ×fₜ array (but if T was a vector, or if there is only one feature, - # dT may be only 2-dimensional) - # promote dT to 3D if necessary (e.g. if T was a vector) - if ndim(dT) < 3: - dT = reshape(dT, (n, 1, shape(dT)[1])) - - # reshape ft_X and dT to allow cross product (result has shape n×dₜ×fₜ×f_x) - features = reshape(ft_X, (n, 1, 1, -1)) * reshape(dT, shape(dT) + (1,)) - features = transpose(features, [0, 1, 3, 2]) # swap last two dims to match cross_product - features = reshape(features, (size(T), -1)) - output = self._model_Y.predict(_add_zeros(np.hstack([W, features]))) - output = reshape(output, shape(T) + shape(output)[1:]) - if ndim(output) == 3: - return transpose(output, (0, 2, 1)) # transpose trailing T and Y dims - else: - return output +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import econml.iv.sieve as sieve +from .utilities import deprecated + + +@deprecated("The econml.two_stage_least_squares.HermiteFeatures class has been moved " + "to econml.iv.sieve.HermiteFeatures; " + "an upcoming release will remove support for the old name") +class HermiteFeatures(sieve.HermiteFeatures): + pass + + +@deprecated("The econml.two_stage_least_squares.DPolynomialFeatures class has been moved " + "to econml.iv.sieve.DPolynomialFeatures; " + "an upcoming release will remove support for the old name") +class DPolynomialFeatures(sieve.DPolynomialFeatures): + pass + + +@deprecated("The econml.two_stage_least_squares.NonparametricTwoStageLeastSquares class has been moved " + "to econml.iv.sieve.SieveTSLS; " + "an upcoming release will remove support for the old name") +class NonparametricTwoStageLeastSquares(sieve.SieveTSLS): + pass diff --git a/monte_carlo_tests/monte_carlo_honestforest.py b/monte_carlo_tests/monte_carlo_honestforest.py index ea5336e1..240105db 100644 --- a/monte_carlo_tests/monte_carlo_honestforest.py +++ b/monte_carlo_tests/monte_carlo_honestforest.py @@ -5,7 +5,7 @@ import time import argparse import warnings import joblib -from econml.sklearn_extensions.ensemble import SubsampledHonestForest +from econml.grf import RegressionForest def monte_carlo(): @@ -19,7 +19,7 @@ def monte_carlo(): print(it) X = np.random.normal(0, 1, size=(n, d)) y = X[:, 0] + np.random.normal(size=(n,)) - est = SubsampledHonestForest(n_estimators=1000, verbose=1) + est = RegressionForest(n_estimators=1000, verbose=1) est.fit(X, y) point = est.predict(X_test) low, up = est.predict_interval(X_test, alpha=0.05) @@ -47,5 +47,5 @@ def monte_carlo(): if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Monte Carlo Coverage Tests for the SubsampledHonestForest') + parser = argparse.ArgumentParser(description='Monte Carlo Coverage Tests for the RegressionForest') 
monte_carlo() diff --git a/notebooks/Causal Forest and Orthogonal Random Forest Examples.ipynb b/notebooks/Causal Forest and Orthogonal Random Forest Examples.ipynb index 97b1459f..36712177 100644 --- a/notebooks/Causal Forest and Orthogonal Random Forest Examples.ipynb +++ b/notebooks/Causal Forest and Orthogonal Random Forest Examples.ipynb @@ -52,12 +52,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Main imports\n", - "from econml.ortho_forest import DMLOrthoForest, DROrthoForest\n", + "from econml.orf import DMLOrthoForest, DROrthoForest\n", "from econml.dml import CausalForestDML\n", "from econml.sklearn_extensions.linear_model import WeightedLassoCVWrapper, WeightedLasso, WeightedLassoCV\n", "\n", @@ -101,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -112,7 +112,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -158,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -182,34 +182,15 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n", - "[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 8.6s\n", - "[Parallel(n_jobs=-1)]: Done 248 tasks | elapsed: 10.1s\n", - "[Parallel(n_jobs=-1)]: Done 888 tasks | elapsed: 13.9s\n", - "[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 14.6s finished\n", - "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n", - "[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 0.0s\n", - "[Parallel(n_jobs=-1)]: Done 368 tasks | elapsed: 2.4s\n", - "[Parallel(n_jobs=-1)]: Done 981 tasks | elapsed: 5.4s\n", - "[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 5.4s finished\n" + "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n" ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ @@ -218,19 +199,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n", - "[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 8.9s\n", - "[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 11.0s finished\n" - ] - } - ], + "outputs": [], "source": [ "# Calculate treatment effects\n", "treatment_effects = est.effect(X_test)" @@ -238,20 +209,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n", - "[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 2.0s\n", - "[Parallel(n_jobs=-1)]: Done 85 out of 100 | elapsed: 4.5s remaining: 0.7s\n", - "[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 4.5s finished\n" - ] - } - ], + "outputs": [], "source": [ "# Calculate default (90%) confidence intervals for the test data\n", "te_lower, te_upper = est.effect_interval(X_test, alpha=0.01)" @@ -259,170 +219,27 @@ }, { "cell_type": "code", - 
"execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n", - "[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 2.4s\n", - "[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 4.8s finished\n" - ] - } - ], + "outputs": [], "source": [ "res = est.effect_inference(X_test)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " point_estimate stderr zstat pvalue ci_lower ci_upper\n", - "0 1.161 0.183 6.339 0.0 0.860 1.462\n", - "1 1.171 0.177 6.628 0.0 0.881 1.462\n", - "2 1.182 0.171 6.925 0.0 0.901 1.462\n", - "3 1.192 0.165 7.228 0.0 0.921 1.463\n", - "4 1.202 0.160 7.533 0.0 0.940 1.465" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "res.summary_frame().head()" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "
Uncertainty of Mean Point Estimate
mean_point  stderr_mean  zstat  pvalue  ci_mean_lower  ci_mean_upper
3.179       0.287        11.06  0.0     2.706          3.652

Distribution of Point Estimate
std_point  pct_point_lower  pct_point_upper
1.715      1.212            6.168

Total Variance of Point Estimate
stderr_point  ci_point_lower  ci_point_upper
1.739         1.16            6.247
Note: The stderr_mean is a conservative upper bound." - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "res.population_summary()" ] @@ -436,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -459,22 +276,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAA24AAAFNCAYAAAB49jzWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAACZs0lEQVR4nOzdd3gc1dXA4d/dptWqy5JsWbbcG25yAQyEakxvoSb06hACobcQvkBCEpLQa0IILUAgtNA7mGqKG+7dsnqXVtvL7P3+WNlYtiSrrXYln/eJH6yd2Zm7kqMz55ZzldYaIYQQQgghhBCJyxTvBgghhBBCCCGE6JgkbkIIIYQQQgiR4CRxE0IIIYQQQogEJ4mbEEIIIYQQQiQ4SdyEEEIIIYQQIsFJ4iaEEEIIIYQQCU4SNzGgKaVWKaUOiXc7+oJSqlgpdXi82yGEEEL0lFLqNqXUs/FuhxCJRBI3ERdKqTOVUouUUm6lVKVS6l2l1E96eM2nlFJ37Pia1nqy1npBjxobQyrqeqXUBqWUTylVopS6UymVtJv37fJZe9gOrZTytPw83Eqppt66difvL0mnEELEWSxicy+1a2RLnHLv8OeHPm6DVkqN7ct7CrEzSdxEn1NKXQPcB/wJGAwUAo8AJ8axWfHyADAfOBdIA44GDgP+294blFLmGLVlutY6teVPZlffrJSyxKBNQggh+kA/ic2ZO8Sp6V19s8Qp0d9J4ib6lFIqA/g98Cut9ataa4/WOqS1flNrfb1SKkkpdZ9SqqLlz33bRp+UUocopcqUUtcqpWpaegMvaDk2HzgLuKGlJ+7Nlte3j+S0TLv4r1LqGaWUq2Ua5ewd2taqN23nUS2l1CVKqY1KqQal1BtKqaEtr2/rCbTscO4CpdTFLX8fq5T6TCnlVErVKaVebHl9HHAZcJbWeqHWOqy1XgWcAhyllDpsh3Y8qpR6RynlAS5q67O2KFJKLW+514tKKfvu2r+7n1fL96tWKbVVKfVbpZSp5dj5SqmvlFL3KqUagNtafn53tYwcViul/q6USm45P0cp9ZZSqqmlDV8opUxKqX8TfUB4s+Xz3LC7dgkhhOg9nYjN+yilFrb8/q5USj2klLK1vLdbMbDl2P1KqVKlVLNSarFS6sButH1oS0xraIlxl+xw7Dal1MtKqWeVUs3A+S1x7V8tn6NcKXWHaukQ7SBef95yyR9a4tQZXf8uC9FzkriJvrYfYAdea+f4LcAcoAiYDuwD/HaH40OADKCAaALzsFIqS2v9GPAc8NeWnrjj27n+CcALQCbwBvBQZxrdkkT9GTgdyAe2tlynM/4AfABkAcOAB1tenwuUaa2/2/FkrXUp8A0wb4eXzwT+SHRU7hna/6ynA0cBo4BpwPk9bP+DRL/fo4GDiY4MXrDD8X2BzUBeS/v+Aown+vMbS/Tn9H8t514LlAG5RHtzfxP9uPocoAQ4vuXz/LUT7RJCCNF7dhebDeBqIKfl3LlEOx47o70YCPA90XiRDTwPvLRjh2Mn/YdobBkKnAr8SSk1d4fjJwIvE437zwFPA2GiMWoGcARwcUdt1Vof1HJ828yU7cmnEH1JEjfR1wYBdVrrcDvHzwJ+r7Wu0VrXArcD5+xwPNRyPKS1fgdwAxO6cP8vtdbvaK0N4N9Ek8POOAt4Qmu9RGsdAG4G9lNKjezEe0PACGCo1tqvtf6y5fUcoLKd91S2HN/mda31V1rriNba38G9HtBaV2itG4A3iQbEzrZ/SUtvapNS6oGWHsgzgJu11i6tdTFwN61/HhVa6wdbfp5+4BLgaq11g9baRXTKzc92+D7kAyNafn5faK11B59FCCFE3+gwNmutF2utv2mZGVIM/INoZ15ntBcD0Vo/q7Wub7nu3UASHcf0uh3i1HVKqeHAT4AbW669DHic1nFqodb6f1rrCJBOdEnCVS2jijXAvbSOU222VYhEIImb6Gv1QI5qf575UKKjQdtsbXlt+/t3CixeILUL96/a6b32DtrSbru01m6in6WgE++9AVDAdyo6PfPCltfriCYybclvOb5NaSfuA7t+vm3fm860f6bWOrPlz6+JJo42dv157PieHduVCziAxdsCK/Bey+sAfwM2Ah8opTYrpW7q5GcSQggRWx3GZqXU+Jap7lUtUw7/ROvOxY60FwNR0aUPa1qmJjYRneHR0XVzdohTdxGNbds6CrfpKE6NAKxA5Q5x6h9EZ4102FYhEoEkbqKvLSQ6MnNSO8criP5i3aaw5bXO6OnojZdo4rHNkPbapZRKIdpDWQ54Wl5u871a6yqt9SVa66HAL4BHVHQt3SfAcKXUPjs2oqUHcQ7w8Q4v7/zZuvpZO2p/e+r4sfdxm8Kd3qN3Ot8HTN4hsGZorVMBWkbtrtVajwaOB67ZYTqLjLwJIUT87C42PwqsBcZprdOJTnVXLce6FQNb1rPdSHQKf1ZLUSznDtftjAogWymVtsNrHcWpUiBA6wQwXWs9uaO2dqE9QsSUJG6iT2mtnUTXPD2slDpJKeVQSlmVUkcrpf5KdK76b5VSuUqpnJZzO7uPSzXRtVjdtQw4UyllVkodRetpIM8DFyililS0WMqfgG+11sUtUzrLgbNb3nshMGbbG5VSpymlhrV82Ug0iBha6/XA34HnlFJzWt47GXgF+Ehr/VEvftZ229/eG1qmk/4X+KNSKk0pNQK4hnZ+Hi3TUP4J3KuUygNQShUopY5s+ftxLcFaAc1E10wY3fw8QgghekknYnMa0d/bbqXUROCXO7y3WzGw5ZphoBawKKX+j+hUxq60uxT4GvizUsqulJpGdP37c+2cX0l0DdvdSql0FS2QNUYpdfBu2goSp0QCkMRN9Dmt9T1EE4DfEv2FXQpcDvwPuANYBCwHVgBLWl7rjH8Be7VMf/hfN5p2JdGRoCaia8K2X0Nr/TFwK9GkqpJoUPrZDu+9BLie6HSTyUQDyTZ7A98qpdxEC6JcqbXe0nLscqLz8Z8lul7vPWAB0cqSHenSZ+1E+9tzB
dHe1M3Al0QTwCc6OP9GotMhv2mZTvMRP65XGNfytZto7+4j+sc99v5MNGFvUkpd14l2CSGE6EW7ic3XES2S5SLaQbdzcY7uxMD3gXeB9USnN/rp/LKAHf0cGEl09O014Hda6w87OP9cossAVhNNzl7mx2ULHcXr24CnW+LU6d1opxA9pqQ2gBBCCCGEEEIkNhlxE0IIIYQQQogEJ4mbEEIIIYQQQiQ4SdyEEEIIIYQQIsFJ4iaEEEIIIYQQCU4SNyGEEEIIIYRIcJZ4N2BHOTk5euTIkfFuhhBCiBhbvHhxndY6N97t6C8kPgohxJ6jvRiZUInbyJEjWbRoUbybIYQQIsaUUlvj3Yb+ROKjEELsOdqLkTJVUgghhBBCCCESnCRuQgghhBBCCJHgJHETQgghhBBCiASXUGvc2hIKhSgrK8Pv98e7KWIHdrudYcOGYbVa490UIYQQQgjRDnmWTlxdfZ5O+MStrKyMtLQ0Ro4ciVIq3s0RgNaa+vp6ysrKGDVqVLybI4QQQggh2iHP0ompO8/TCT9V0u/3M2jQIPmHlkCUUgwaNEh6boQQQgghEpw8Syem7jxPJ3ziBsg/tAQkPxMhhBBCiP5BntsSU1d/LjFN3JRSVyulVimlViql/qOUssfyfrFSVlbGiSeeyLhx4xgzZgxXXnklwWAQgAULFpCRkcGMGTOYOHEi11133fb3PfXUU+Tm5lJUVERRURHnnntuj9pRXFzM888/v/3rRYsW8etf/7pH19zmqaeeoqKioleuJYQQQgghxDbyLN07Ypa4KaUKgF8Ds7XWUwAz8LNY3S9WtNacfPLJnHTSSWzYsIH169fjdru55ZZbtp9z4IEHsnTpUpYuXcpbb73FV199tf3YGWecwbJly1i2bBnPPPNMj9qy8z+22bNn88ADD/TomttI4iaEEEIIIXqbPEv3nlhPlbQAyUopC+AA+l1m8Mknn2C327ngggsAMJvN3HvvvTzxxBN4vd5W5yYnJ1NUVER5eXmnr19bW8spp5zC3nvvzd577739H+pnn322vXdhxowZuFwubrrpJr744guKioq49957WbBgAccddxwAt912G+eddx5HHHEEI0eO5NVXX+WGG25g6tSpHHXUUYRCIQB+//vfs/feezNlyhTmz5+P1pqXX36ZRYsWcdZZZ1FUVITP52Px4sUcfPDBzJo1iyOPPJLKysre+HYKIYQQQog9iDxL996zdMwSN611OXAXUAJUAk6t9Qexul+srFq1ilmzZrV6LT09ncLCQjZu3Njq9cbGRjZs2MBBBx20/bUXX3xx+z+aJ598cpfrX3nllVx99dV8//33vPLKK1x88cUA3HXXXTz88MMsW7aML774guTkZO68804OPPBAli1bxtVXX73LtTZt2sTbb7/N66+/ztlnn82hhx7KihUrSE5O5u233wbg8ssv5/vvv2flypX4fD7eeustTj31VGbPns1zzz3HsmXLsFgsXHHFFbz88sssXryYCy+8sFWviBBiz+UJhOPdBCGEEP3IQHqWjkQ0F83/JZ9/9Q2Llv6A2+Pl1f+9znEn/pSZs2bxzL+fjemzdMy2A1BKZQEnAqOAJuAlpdTZWutndzpvPjAfoLCwsMNr3v7mKlZXNPdqO/cams7vjp/c7nGtdZsLB3d8/YsvvmDatGmsW7eOm266iSFDhmw/74wzzuChhx5q9/offfQRq1ev3v51c3MzLpeLAw44gGuuuYazzjqLk08+mWHDhu32sxx99NFYrVamTp2KYRgcddRRAEydOpXi4mIAPv30U/7617/i9XppaGhg8uTJHH/88a2us27dOlauXMm8efMAMAyD/Pz83d5fCDGwNftDbK71UDQ8M95NEUII0Q3yLN2xjp6lN23egicY5pNPP+H+e+7G5/PS2NDI+ImTmHfUsWgd/UwQu2fpWO7jdjiwRWtdC6CUehXYH2iVuGmtHwMeA5g9e7aOYXu6ZfLkybzyyiutXmtubqa0tJQxY8ZQX1/PgQceyFtvvcX69ev5yU9+wk9/+lOKioo6df1IJMLChQtJTk5u9fpNN93EscceyzvvvMOcOXP46KOPdnutpKQkAEwmE1ardfv/GUwmE+FwGL/fz2WXXcaiRYsYPnw4t912W5slSLXWTJ48mYULF3bqMwghBr6QEWFFmRO71RzvpgghhOhH+vuztNYaQ4MvEMTn83PtVb9mwZcLGTZsOH++4/cE+vBZOpaJWwkwRynlAHzAXGBRTy7YUTYfK3PnzuWmm27imWee4dxzz8UwDK699lrOP/98HA5Hq3PHjx/PzTffzF/+8hf+85//dOr6RxxxBA899BDXX389AMuWLaOoqIhNmzYxdepUpk6dysKFC1m7di3Dhw/H5XJ1+7NsS9JycnJwu928/PLLnHrqqQCkpaVtv/aECROora1l4cKF7LfffoRCIdavX8/kyX3//RdCxJ/WmpXlTnxBQxI3IYTox+RZumvP0sGwQSAcIRKJji1te5YeNCj6LP36/17lxJNOBiA1NTXmz9KxXOP2LfAysARY0XKvx2J1v1hRSvHaa6/x0ksvMW7cOMaPH4/dbudPf/pTm+dfeumlfP7552zZsqVT13/ggQdYtGgR06ZNY6+99uLvf/87APfddx9Tpkxh+vTpJCcnc/TRRzNt2jQsFgvTp0/n3nvv7fJnyczM5JJLLmHq1KmcdNJJ7L333tuPnX/++Vx66aUUFRVhGAYvv/wyN954I9OnT6eoqIivv/66y/cTQgwMm+s81LuD8W6GEEKIfqg/PkuHjAgA/lAEvcN8wMzMTM47/0L223smZ55xKjNnzt5+7MxzzuVXv7osps/SSuvEmZ04e/ZsvWhR60G5NWvWMGnSpDi1SHREfjZCDHw1Lj/LS53bv85KsTFrRFaPr6uUWqy1nr37MwW0HR+FEKIz5HmtawKh6ChbdyTbzFjNXRsXa+vn016MjOVUSSGEEP2YJxBmVS8vYhdCCCESlT9kEOxm0tYXYr2PmxBCiH4oZET4obQJw0icWRlCCCFErCR60gYy4iaEEKINqyqa8QaNeDdDCCGEiCmtNf6QQagfdFRK4iaEEKKVjTVu6lyBeDdDCCGEiKmI1viCBkYk8ZM2kMRNCCHEDqqcforrPPFuhhBCCBFTRkTjC4bpJzkbIGvchBBCtHB6Q6yudO7+RCGEEKKf84eMfpW0gSRuHaqvr6eoqIiioiKGDBlCQUHB9q+DwdjuabR27VqKioqYMWMGmzZt4oEHHmDSpEmcddZZXb7Wfffdh9frjUErhRADhT9k8ENZE5HEXpcthBCin0nU5+muTo985KEH4v48LVMlOzBo0CCWLVsGwG233UZqairXXXfd9uPhcBiLJTbfwv/973+ceOKJ3H777QA88sgjvPvuu4waNarL17rvvvs4++yzd9mdXgghAMJGhGWlTQlfTUsIIUT/k4jP08MLR3S5ANejDz3IGT87M67P05K4ddH5559PdnY2S5cuZebMmaSlpbX6BzhlyhTeeustRo4cybPPPssDDzxAMBhk3333
5ZFHHsFsNre63uLFi7nmmmtwu93k5OTw1FNPsXTpUu677z7MZjOff/45EyZMYPPmzZxwwglceOGFzJ8/nyuuuIIVK1YQDoe57bbbOPHEEzEMgxtvvJH3338fpRSXXHIJWmsqKio49NBDycnJ4dNPP43Ht00IkaC01qwod+L2h+PdFCGEEHuIeD9Pn3ve+Zx9/kXccM1VrFq1knA4zM233Mqxx5+AYRj8329/wycffYBSivMuuAitNZWVFRx39DwGDcrhrfc+jMe3TRK37li/fj0fffQRZrOZ2267rc1z1qxZw4svvshXX32F1Wrlsssu47nnnuPcc8/dfk4oFOKKK67g9ddfJzc3lxdffJFbbrmFJ554gksvvbTVP+D33nuPTz/9lJycHH7zm99w2GGH8cQTT9DU1MQ+++zD4YcfzjPPPMOWLVtYunQpFouFhoYGsrOzueeee7a/VwghdrS2ykW9O7ZTVYQQQoidxfN52pGeya233MJBhxzCw//4J01NTRx20P4ccthc/vPcs2wt3sIXC79v9Tz98AP389a7HzIojs/T/S5xO+SQXV87/XS47DLweuGYY3Y9fv750T91dXDqqa2PLVjQ9Tacdtppu2T6O/v4449ZvHgxe++9NwA+n4+8vLxW56xbt46VK1cyb948AAzDID8/f7f3/+CDD3jjjTe46667APD7/ZSUlPDRRx9x6aWXbh9uzs7O7vJnE0LsOYrrPJQ3+uLdDCGEEH0oEZ6lIb7P00ZE88nHH/HuO2/x4H33AhDwBygrLWHBpx9z4cXzE/J5ut8lbokgJSVl+98tFguRHVbz+/1+IDr96LzzzuPPf/5zu9fRWjN58mQWLlzYpftrrXnllVeYMGHCLq8rpbp0LSHEnqnK6WdjjTvezRBCCLGHitfzdCSi0Tr6vn8//yLjxvef5+l+l7h1lNU7HB0fz8npfq9Ae0aOHMlbb70FwJIlS9iyZQsAc+fO5cQTT+Tqq68mLy+PhoYGXC4XI0aM2P7eCRMmUFtby8KFC9lvv/0IhUKsX7+eyZMnd3jPI488kgcffJAHH3wQpRRLly5lxowZHHHEEfz973/nkEMOaTW0m5aWhsvlkqmSQggA6twBKfsvhBB7qER7loa+fZ42dLSa5NzD5/GPRx/hb/fch1KKH5YtZXrRDA6bO48nHn+MAw86uNXzdGpaKi63K65TJWU7gB465ZRTaGhooKioiEcffZTx48cDsNdee3HHHXdwxBFHMG3aNObNm0dlZWWr99psNl5++WVuvPFGpk+fTlFREV9//fVu73nrrbcSCoWYNm0aU6ZM4dZbbwXg4osvprCwkGnTpjF9+nSef/55AObPn8/RRx/NoYce2sufXgjR3zi9IVaUOaXsvxBCiITRl8/T27YBuOHmWwiFQuy/z0zmzC7ij7+PVp4874ILGT68kP33mckB+87i5RdfAOD8Cy/m1JOO57ij5sXiW9ApSuvE2Xlu9uzZetGiRa1eW7NmDZMmTYpTi0RH5GcjRP/iDoRZVNxA2Oj+7/2sFBuzRmT1uC1KqcVa69k9vtAeoq34KIQQnSHPa615AuEu7+HWWck2M1Zz18bF2vr5tBcjZcRNCCH2AN5gmKUljT1K2oQQQoj+TGsds6StL0jiJoQQA5w3GGbx1kYCIZkfKYQQYs/Vn5M2kMRNCCEGNEnahBBCiCgjgZaIdYckbkIIMUB5ApK0JSql1NVKqVVKqZVKqf8opezxbpMQQgx0MuImhBAi4bj8oZgkbcFAr15uj6SUKgB+DczWWk8BzMDP4tsqIYQY+CRxE0IIkVCa/SGWlDQRDPdu0vbJ23Z+fnQ6paW9etk9lQVIVkpZAAdQEef2CCHEgLZt4+3+TBK3Trj//vuZMmUKkydP5r777tv+ekNDA/PmzWPcuHHMmzePxsZGAL766iumTZvG3nvvzcaNGwFoamriyCOPpL3tFw455BAmTJhAUVERkyZN4rHHHtt+bOTIkdTV1bU6/6mnniI3N5eioiImT57MqaeeitfrbfPa7777LrNnz2bSpElMnDiR6667DoDbbruNu+66q9vfFyFE4nF6QyzZ2kiol5O2yjIT9/8+jYwsTX5+r156j6O1LgfuAkqASsCptf5g5/OUUvOVUouUUotqa2v7uplCCNFrEuFZurKmptX5z/37GUYXDuUn+85m31nTOefMM9p9lv7w/fc4+IA57D1jKrOLpnDLzTcC8Oc7fs8D993T7e9LV1n67E695KPV1b16vcP3Gtzh8ZUrV/LPf/6T7777DpvNxlFHHcWxxx7LuHHjuPPOO5k7dy433XQTd955J3feeSd/+ctfuPvuu3nllVcoLi7m0Ucf5e677+YPf/gDv/nNb1BKtXuv5557jtmzZ9PQ0MCYMWM4//zzsdls7Z5/xhln8NBDDwFw5pln8uKLL3LBBRfs0v7LL7+ct99+m4kTJxIOh1v9QxZCDBwuf4glpY0YMSj572wwkTckwh33ebBYMnr9+nsSpVQWcCIwCmgCXlJKna21fnbH87TWjwGPQXQft75upxBiYNpTn6Xb6tA8+ZTTuOve+wG46PxzePXllzj73PNanbN61Uquu+YqXnr1f4yfEH2WfvJfj3f129QrZMRtN9asWcOcOXNwOBxYLBYOPvhgXnvtNQBef/11zjsv+sM977zz+N///geA1WrF5/Ph9XqxWq1s2rSJ8vJyDj744E7d0+12k5KSgtls7tT54XAYj8dDVtaum+L+9a9/5ZZbbmHixIkAWCwWLrvssk5dVwjRf/hDBj+UOmOStAFMnBbmH6/VM3S4FDrpBYcDW7TWtVrrEPAqsH+c2ySEEDGRKM/SHU2TDIfDeD0eMjMzdzl2/713c90NNzF+wo/P0pf84tJOtaO3SeK2G1OmTOHzzz+nvr4er9fLO++8Q2nLAo/q6mryW+YM5efnU9MyBHvzzTczf/587rvvPi6//HJuueUW/vCHP+z2XmeddRbTpk1jwoQJ3HrrrbtN3F588UWKioooKCigoaGB448/fpdzVq5cyaxZs7r6sYUQ/YgR0fxQ2oQ/ZPT6tZcvsvKve1MJh8AkEaO3lABzlFIOFe06ngusiXObhBAiJhLhWbq9nO3VV17iJ/vOZuLYkTQ2NnL0scftcs7qVasomjGjG5+898UsDCulJiillu3wp1kpdVWs7hcrkyZN4sYbb2TevHkcddRRTJ8+HYul4xmmRUVFfPPNN3z66ads3ryZoUOHorXmjDPO4Oyzz6a6uu0h6ueee47ly5dTUlLCXXfdxdatWzu8zxlnnMGyZcuoqqpi6tSp/O1vf+v25xRC9E9aa1aWO3H5w71+bWej4s83ZPDFh0mEgu1PTRFdo7X+FngZWAKsIBqLZQ67EGJA6stn6X899TTfL17a6llaa91u5nbyKafx5beL2LCllL0mT+H+e+/u6ceNqZglblrrdVrrIq11ETAL8AKvxep+sXTRRRexZMkSPv/8c7Kzsxk3bhwAgwcPprKyEoDKykry8vJavU9rzR133MGtt97K7bffzu23387
ZZ5/NAw880OH9cnNzmTlzJt9++22n2qeU4vjjj+fzzz/f5djkyZNZvHhxp64jhOh/1lS6qHX1fo1+reGuWzJwNpj47d1OklNkiVVv0lr/Tms9UWs9RWt9jtZaNloQQgxYffUsHTYgEI6QPShn+7N0qBNLCJRSHH3MsXz95Ze7HJu0114sW7q0qx85Jvpq4stcYJPWuuMhpAS1bdi2pKSEV199lZ///OcAnHDCCTz99NMAPP3005x44omt3vf0009z7LHHkpWVhdfrxWQyYTKZ2q1Ys43X62Xp0qWMGTOm02388ssv2zz/+uuv509/+hPr168HIBKJcM89fVf9RggRO2urmqlo8sXk2q/928E3nyUx/3oXYyf1/mieEEKIPUdfPEtHdljE1uB0bX+WDhmdW5u98OuvGDV69C6v//qqa7j7b39h44Yfn6UfeuC+Tl2zt/VVVcmfAf9p64BSaj4wH6CwsLCPmtM1p5xyCvX19VitVh5++OHtRUBuuukmTj/9dP71r39RWFjISy+9tP09Xq+Xp59+mg8+iFZ4vuaaazjllFOw2Wz85z9tfis466yzSE5OJhAIcP7557damzZt2jRMLQtMTj/9dKZNm8aLL77Il19+SSQSYdiwYTz11FO7XHPatGncd999/PznP8fr9aKU4thjj+2tb40QIk7WV7soa4hN0uZyKp5+KIX9D/Nz4pmxuYcQQog9R6yfpSM6ukfbJReeS7I9mUAwwNnnnkvRjJl4AtHOx/33mbX9Wfqnp5zK5ClTefWVl/jm66+I6AhDhw7j0cd2rRY5Zeo07vzrXVx43jn4fNFn6SOOOiYm36fdUe3thdBrN1DKRnRj0cla6w7rj86ePVsvWrSo1Wtr1qxh0qRJMWyh6C752QgRHxtrXBTXdTxy31Ob1lrIHWKQntk6RmSl2Jg1YtcKtl2llFqstZ7d4wvtIdqKj0II0Rl7wvOaP2QQbKPcv8WsCMeo2nJbkm1mrOauTWhs6+fTXozsi6mSRwNLdpe0CSGE6FjYiPBDaVPMkjato1UkAcZMDO+StAkhhBCJJqI1wXamQ/Zl0tYX+iJx+zntTJMUQgjROZ5AmO+KG2JSiGSbN19I5trzsvn+C1vM7iGEEEL0plA40n69/wEmpmvclFIOYB7wi1jeRwghBrKKJh/rql0x21wbYP0qC3//Sxr7HBRg1gHBmN1HCCGE6C0hI0KgjSmSA1VMEzettRcY1AvXIbpHqUgUsV4bKYQAlz/EuioXTd5QTO/jcSnuuDaDzEERbviTUzbaFkKIAWYgPkuHjAi+oBHvZvRIV5+nEz482+126uvrJVFIIFpr6uvrsdvt8W6KEANSJKJZX+3iuy0NMU/atIa7b02nusLMLXc5yciS37VCCDGQDMRn6YGStHX1ebqvtgPotmHDhlFWVkZtbW28myJ2YLfbGTZsWLybIcSA4w8ZLC9z0uyLbcK2jVJw4BF+pu0dZPKMvrmnEEKIvjPQnqUjkfaLkcSb1WzCbOr8yGZXn6cTPnGzWq2MGjUq3s0QQoiYq3cHWFnRHF1o3QdCQbDa4NBjYlfwRAghRHwNpGfpYDjCt1vqCYQSM3ErKswkJzUpZtdP+KmSQgixJyht8LKstKnPkrbGesVFJwzik7dlyrMQQoj+YW1Vc8ImbX0h4UfchBBioCup97K+2tVn9zMMuPPGDOprzBSODvfZfYUQQojuKm/yUdO8Z88QkRE3IYSIo+I6T58mbQBPP5TCkoVJXPHbZsZOksRNCCFEYvMGw6yv6ttYmYhkxE0IIeJkS52HTTXuPr3n158k8Z/HUjnmVC9Hnezv03sLIYQQXdXgCbK2qhkjkrhVMbWOFvuKNUnchBAiDiqafH2etAEUbzQzcWqIX/1Gei6FEEIkLk8gzIYaN3WuxJ4eGYnAPf+XzqRpIYqui+29JHETQog+tq33MB7OnO/l1PO92Gxxub0QQgjRLpc/RKMnRIM3SL07QKJvPac1PHhHGu+/lszgobHfV04SNyGE6EOeQJjlZU1E+rAoltbw8J/SOOQoP1NmhSRpE0IIkVD8IYMlWxvx9qNNtbWGx+5K5a0XHZxxkYezf+kBMmN6TylOIoQQfSBsRHD6QiwrbSJs9G0X4stPOXj9eQcrl1j79L5CCCHE7gTC/S9pA3jygRRefiqFE8/0ctHVblnjJoQQ/VXIiFDl9FPd7McTNPpsf7adLf7axuP3pHLgEX7OuNgblzYIIYQQbQkZEZaWNPW7pA0gJVVz7GleLrvZ1SdJG0jiJoQQvarBE6S80Uet29+n0yHbUllm4o/XZVA42uD6O5r7LLAIIYQQO4pENNUuP9XNAexWEyk2CylJFjbVunH7+9e2NA21JrJzI5xxkbfPqkluI4mbEEL0UMiIUNnkp6zRm1C9hm++4EBruO2BJpJTEnyFtxBCiAEnEtGUNfooafDiDyVOfOyu/z7h4PnHUnjwPw0MH2X0eYeoJG5CCNFFISNCnTtAsy+M0xfCHQjFfXStLRdf4+boU3wUjOidYOn2h/lsfS0Wk2L68MxeuaYQQoiBa2uDNy5b38TCS086+OfdaRxytJ+hw+OThEriJoQQXeDyh1he5sSXQCNrO3v/NTsz9g2SNzTC8FE9b2dJvZeP11bzXXEDIUOTk5okiZsQQogOBcMRttZ74t2MXvHSkw4euyuatN10pxNznDIoSdyEEKKTqpx+1lQ2Y0QSd9rhwk9t3H1rOsed7uPX/9f9TbbDRoTFJY18sraGTbUebBYT+4/J4aSiofx05rBebLEQQoiBqLje0+dVlGPhq4+TEiJpA0nchBBitwJhgy11HsoafPFuSoe2bjJz540ZjJ0UZv713UvamrxBPltfy+cb6nD6QuSlJXH67GH8ZGwODpuFrBTZBE4IIUTHfEGDssb+Ucm4wRNk8dZGAmEDh81CstWMzWLCiGhCRoTAEJh7gZcZR9bx0TqNEdnhj9atNgn/YmMtvz12L6zm2Oy4JombEEK0o9kfoqTeS40r/hUid6e5SfF/v8okya657YEm7Mmdf6/Wmo01bj5ZV8OSrU1EtGZKQQZzJ+ax19B0TFKOUgghRBdsqnUndNwMGREWFTfy9eY61la6aGtc0LVsOI7x1ZgdQcgrZuPS1scVYDIpdoyQJqW4+ehJWM2xabckbkII0YZaV4AfSpvi3YxOe+L+VGqrzNz1VCN5+Z2LloGwwXdbGvhkbQ2ljT4cNjOHTcrjkPG5DE63x7jFQgghBqJmf4gqpz/ezWjX5lo3T35dTKXTT06qjeOm5TNn9CAGpdjwhQw8AYOXH0/n9fezOLjAyZnnNmM2KcwmhUlF/2tWCpNp107NosJM7LHK2pDETQgh2tRfpnhsM/86NwcdGWCvotBuz611BViwroYvNtbhDRoUZCZzzpwRzBmVTVIMA44QQoiBLRLRrKvq/vrqWAqGI/xvWTkfrqkmM9nK5YeOZfqwDNQOs0pSTSZe+kcmrz+RyrwTffziKj9mc+IsEZDETQghduIPGTR4gvFuRqd894WNqbOCOFI0M+e03+
aI1qypbOaTtTUsL3OiFMwozGLuxDzG5aW2ClxCCCFEd6yrduH07r4Dsa+VN/l49LNNVDn9HDw+l1NnDiPZ1rqjUmv4x19TeeWZFI4+xctVt7kwxWapWrdJ4iaEEDspa/S1WmycqL7/0satv8rklHO9zL+u7X1yPIEwX22q47N1tVS7AqTZLRw7NZ+DxueSLYVGhBBC9JLyJh/ljYlXxOvbLfU8vXAryVYz184bz6T89DbP87gU336exElne7nsJlefb67dGZK4CSHEDrTWVDoTL/DsrHijmTuuzWDkmDBn/3LXfXLKGr18tKaGb7fUEzI0Y3JTOH76UGaNyIpZtSshhBB7JqcvxLqq5ng3o5VgOMIrS8r4eG0N4/JS+cVBo8l07NphaRigI5Carnng+QZS03VCJm0Q48RNKZUJPA5MATRwodZ6YSzvKYQQPVHrChAIJXApLKCxXnHrZVnY7Zo/PNKEIyU6PKi1ZlVFMx+srmZ1ZTM2i4n9Rg/i0Al5DM92xLnVQgghBiJ/yGBFmTMhqkhqrdlU6+HrTXV8X9yIL2Qwb9JgTplVgKWNeY+hINx5UwZms+bmvzaTlpHY021iPeJ2P/Ce1vpUpZQNkCcHIURCK2tK/NG2e/4vg8Z6E3c/1UBefgStNSsrmnltaTklDV4yk62cPKOAg8bnkpokEyuEEELERoMnyMpyJ8Fw/LI2py/E2qpm1la6WFPVTJ07iM1iYlZhFgeOy2H84LQ23+f3we+vzuT7L5L4xfWJOTVyZzGL6EqpdOAg4HwArXUQ6B+r/YUQA1YgbJBkabtyojcYpsGd+L+mLrvJRWmxmQlTw6ytauZ/SyvYWOsmJ9XG+fuPZM6obCwyHVIIIUQMFdd52FTrjsua8EhEs6LCycdralhdGZ2imWw1M2FIGsdNG8rsEVkdluX3uBW3/iqTlYutXH1bM8eclvidthDbEbfRQC3wpFJqOrAYuFJrvetiDCGE6AMhI8J3WxooGp5Jmt26y/GyBFxUvaMlC23MmBMkf7iB1+7k7g/KWVPlIjPZyln7FnLg2BxJ2IQQQsTUtmn58dirzRc0+GJjLZ+uraXWHSDLYeWE6UOZVpBBYbajzb3VdqY13PbrDFYvs3LzX50cekygD1reO2KZuFmAmcAVWutvlVL3AzcBt+54klJqPjAfoLCwMIbNEULs6coafQRCEZaVNrH3yOztvXFaazbWuCmpT9y92955OZl7f5fORTfXUZO/luXlTtLsFk6fPYxDxudhs0jCJoQQIvY21Lj7PGmrcwf4eE0NX2ysxR+KMDY3lZNnFjCjMLPNtWsdUQrOucyDz+Nl34MTf5bNjmKZuJUBZVrrb1u+fplo4taK1vox4DGA2bNnJ/aKQCFEvxWJaEoboolZIBRhaUkTs0dmoYAV5U7qE3iK5OKvbdz/+zQKpzXzifE99lrFyTMKOGxiXodTQYQQQojeVNrg7ZNOzrARYXOdh9WVzaypbGZznQcTitkjs5g3aTAjc1K6fM2SzWZWLLJx7Ok+ps1OvL3mOiNmiZvWukopVaqUmqC1XgfMBVbH6n5CCNGRCqev1eJpTyDMD6VNBI0I3oARx5Z1bPM6C7ddlY49140+bCGzR2Xys72Hk56861RPIYQQIlZqXH7WV7tieg9f0GDB+ho+WF2Nyx9GKRg1KIXjpuZz4Lju7z+6drmF316WhcmkOfgoP6np/XOsKNblxq4AnmupKLkZuCDG9xNCiF1ordvsIWzyJnaPW3m9n6svGUxIBZl45lIuPHIk04ZlxrtZQggh9hBaa5y+EA2eIFvrvTErROINhvlwdTUfr63BGzSYPDSdg8blMnFIGik9rI78/Zc2fn9VBpmDItz5WFO/Tdogxomb1noZMDuW9xBCiN2pdQXwBhN3VG1nTl+It5dX8tn6WrKOrObAqamcdfRYmRYphBCiz2yodlHW5MMwYpfoBMIGn6yt4d2VVXiDBjOGZ3LM1HxGdWMqZFs+fsvO325JZ+TYMH/6exPZuQmw2VwPyAY/QogBrziBi47syOUP8d6qKj5ZVYd7ayZHzMvhhFOHkiHTIoUQQvQhlz/E1hjGzrAR4YuNdby1vBKnL8TUggx+OqOAwuze3fLZ41JMnRnitgebSEntvyNt20jiJoQY0JzeEM2+xJ4SGQgbvLeyig9WVxMIRdCf7kv1okEcdGY9Gcn9Z6RQdJ5SKhN4HJgCaOBCrfXCuDZKCCFabKmLze5d4UiErzfV89bySho8QcblpXLpQaMZ184m2d0RiUQLkYwca3DCz30ce7oP8wCZsCKJmxBiQKtwJvbebCvLnTz77Vbq3EFmFWbh+Xwy73+fwUVXuRgxRpK2Aex+4D2t9akt68B7t5tZCCG6yeUPUdPcu3ubhY0I32xu4O2VldS6Aowc5ODcOSOYPDQdpXa/91pnBQPw199k8O1nSTz+Rh2Dh0YGTNIGkrgJIQawSERT3dz3G4R2RrMvxPPflbBoayND0u1cd8R4Vrw7hFdfTOOks72ccXH/mN4puk4plQ4cBJwPoLUOAom7H4UQYo+yubb3RttCRoQvN9Tx7qoqGjxBCrMdXH7oWKYPy+jVhA3A5VTcdmUmy7+3Mf86F3n5/Xs9W1skcRNCDFi17gDhGC6q7q7NdW4eXbAJlz/MiUVDOWryELaut/HYXWkcfJSfX97oopfjmUgso4Fa4Eml1HRgMXCl1jo2c5OEEKKTmv0hal09H20zIpqvNtbxxg8VNPlCjMlN4Zw5I5jSyyNs21SWmbjll1lUlZq5+S9ODjsuMTtte0oSNyHEgFXRlHjTJD/fUMvz35aQ6bDym2MmbV+IPXZSmN/e08R+hwYwmeLcSBFrFmAmcIXW+lul1P3ATcCtO56klJoPzAcoLCzs80YKIfY8PR1t01qzpKSJ15aWU9XsZ0xuChcfOIoJg9NikrBt89aLDhrrTPz5n41M3zux17X3hCRuQogByR8yaPAkzuwzI6J5/rsSPltfy+T8dC45cDSpdgurf7BisWjGTw5z8JG9u6ZAJKwyoExr/W3L1y8TTdxa0Vo/BjwGMHv27MQbOhZCDCgNniB1PRhtq3H5+fc3W1lT6SI/w86vDhlD0fDMmCZsAT8k2eHCK90cd7qP/OEDe224JG5CiAGputkfs41CuypkRHjs880sLW3iqMlDOHlGASaTonijmd9emsmQYQYP/7dBpkfuIbTWVUqpUqXUBK31OmAusDre7RJC7LlCRoRVFc5uvdeIaD5aU83ryyowmeDMfQo5eHwuZlPsgprW8OozDl7/TzL3P9tIVk5kwCdtIImbEGKAqmhKjPnt/pDBQ59uZG2Vi5/tPZzDJw0GoKrcxE2XZGFN0vzffU2StO15rgCea6kouRm4IM7tEULswdZVuQiEul7Mo84d4O+fbaK43kvRsEzO3LeQ7BRbDFr4IyMMD/8pjTdfdPCTw/0kpwy8IiTtkcRNCDHgOH0hPIFwvJtBsy/EA59soKTBy4UHjGT/MTkANNSauPHiLAJ+xT1PNzKkYM8JOiJKa70MmB3vdgghR
JXTT5Wz652dy8uaePzLLWgNvzhoNLNHZMV0WiREN9T+43UZfP9lEqdf6OGiq9171Lrw3X5UpdRpnXlNCCESRWlD/EvpLytt4ndvrqKs0cdlh4zdnrQB/PcJBw21Zv74aBOjxsc/wRTdJzFSCNGf+UMGa6uau/SeSETzv2XlPPDJRgal2Lj1uEnsPTI75kkbwD/+lsqSb2xcfVszl1y7ZyVt0LkRt5uBlzrxmhBCxN2mWne3eg57iz9k8ML3pXy5sY7hWclcO288w7Ja76188TVuDj/Bz9hJkrQNABIjhRD91qZad5e3zfnv4lI+WlPDAWMGcda+I7BZYp89aQ1K/Rg/p80euJUjO9Ju4qaUOho4BihQSj2ww6F0QJ42hBAJZ0udhy29uHFoV62rcvHk11uo9wQ5esoQTpg+FKs5GtCCAfjnPWmcOd9N1iAtSVs/JzFSCNHfaa2pc3et+vKCdTV8tKaGwyfl8bO9+2abkvdetfPpO3bueKSJ9Ey9xyZt0PGIWwWwCDiB6Oag27iAq2PZKCGE6Ig/ZLCmspk0u5WcVBsZyVZKGrxsqnHHpT2BsMFrS8v5aE0NuWlJ3HDEBMYNTtt+PByCP1yTyTcLkpgyMyhl/wcGiZFCiH6t2RcmFO78GuvVFc08/10JUwsyOH3W8Bi2LMoIR6dGvvZsCjP3DxAMKqy2BCkXHSftJm5a6x+AH5RSrwEerbUBoJQyA0l91D4hhGjF5Q+xrLSJQChCvTtIcZ0Hs1lhdHGqR2+pdPp46JONVLsCHDYhj1NmFpBkNW8/bhjwl5sz+GZBElf8tlmStgFCYqQQor+r83Q+HlU6fTz62SbyM5KZf+BoTDEs9Q/Q3BQtQrJkYRKnnOvhkmvdmKWk4u6LkwAfAMk7fJ0MfBSb5gghRPsaPUEWb23cpWRxvJI2TyDMg59sxBsyuHbeeM7ct7BV0haJwL2/S2fBu3YuudbFCT/3xaWdIqYkRgoh+qX6Tk6TjFZI3ojFrLjisLEk28y7f1MP/fmGDFYssnHtHU4uvVGStm06822wa623zz/SWruVUo6O3iCEEL2hrNGLyx/GiGgiWlPnDhBJkMr5kYjmn19spt4T5PojJjA2L3WXc9zNilVLrZxzmZvTL4x/pUsRExIjhRD9TiBs0Ozb/VqxQMjgwU830uQNct0RE8hJje2Egm1FSOZf58bjVkyZueeuZ2tLZxI3j1JqptZ6CYBSahYg3cZCiJgKhiOsr3YlTKK2s9eWlbOyoplz54zYJWnTOjralp6peejFBhwpe/ac/AFOYqQQot/pzGibEdE89sVmius9XHbwGMbk7tpB2VsiEfj3Iyk01pu46ncu2SqnHZ1J3K4CXlJKVbR8nQ+cEbMWCSEEUN7kS9ik7fviBt5dWcXB43M5aHxuq2Naw+P3pFJTaeamO52kpErSNsBdhcRIIUQ/s7vETWvNf74r4YcyJ2fuU8iMwqyYtcXdrPjLTRl881kSR5zkwwgjUyPbsdtvi9b6e6XURGACoIC1WmsZtxRCxEwkoilrTMyphasqnPzryy2MzU3l53vvWlXr6YdS+O8TKRx/hhdT7JcBiDiTGCmE6G+01tTvpjDJp+tqWbC+lqMmD+GwiXkxa8uWDWZu+3Um1RVmfvWbZk4800cf7OPdb+02cWuZq38NMEJrfYlSapxSaoLW+q3YN08IsSeqcQV2KUCSCFZVOHno043kZ9i5/NCxWMyt6zv9+5EUnvt7Kked4uPy37ok+OwBJEYKIfobpy/U4abbW+o8vLiolGnDMjh5ZkHM2hHww00XR0fy7nqycft6NpMJRuekkuWwEdHRNe7+cIStdR68QaNL9ygc5KCkPjE7grujMwORTxLdo2a/lq/LgJcACUpCiJgoaUi8X7KrK5p56NONDE63c8288aTaW//6fOGfDp55OJUjf+rj6tuaMXWmZq8YCCRGCiH6lY423fYGw/zj801kJFu58IBRmGLQA2mEwWSGJDvc9BcnhWMMBuVGO2sdSWamFGSQbrfu8r78dDsVTh9b6jyd6tzNdFgZPziNZKuZdVWuXv8c8dCZxG2M1voMpdTPAbTWPqWkH1kIERtN3mCnKl31pe+2NPDk11vIS7Nz7bzxpLURUKbMDHH8z7xcfotLkrY9i8RIIUS/Uudue5qk1ponvy6m0RPihqMmkJrU+wvN6qpN/OGaDA471s+p5wY48ggT23Ync9jMjMpJxdzOHnEmk2JYloOhGcm4g2F8QQNv0KCm2Y/Lv2sxk8LsaIHf4dkOIlqzodq9yzn9TWd+IkGlVDKgAZRSYwDZQVYIEROlDYlTkM/tD/Pst1tZtLWR0TkpXHHY2F2StrXLLUycFmbKrBBTZiVWwin6hMRIIUS/4QsauNtIcgA+XlvD0pImTp89LCYVJJd+Y+VPN2Tg9ypOOTdC0bBMMhy7doTujsmkSLdbt4/KDUm3883meozIj9M/k21mctN+3LpgxKAUwhHNllpPzz9IHHWmX/h3wHvAcKXUc8DHwA0xbZUQYo8UCBvUuPzxbgYAK8ud/O7NVSwtbeKnMwq48aiJrZI2raOFSK74+SAWf22LY0tFnEmMFEL0G6XtFP5q8gZ5dWk50woymDdpcK/eMxKBZx9N4caLs0jPiG6T86uL7N1K2tqSbDMzKiel1WvDsxzsPPlh1KAUzOb+PSGi3RE3pdQBWuuvgM+Bk4E5RCtmXam1ruvMxZVSxYALMICw1np2j1sshBiw6t1BdAJUzy+u82xfz3bV3HEMz269n7LW8NQDKTz/WHRNW9G+u98PRwwsvREjhRCiLxkRTUVT27Na3l5RiWFozth7+C4JT2eYTQqi/4sWFNlhCdraFVaefiiVucf7uPJWF5NHOhicbu/mp2jbiEEOqpr9uP1hzGbF0Mxdr28yKXJTk6hyJkYHcXd0NFXyAWAWsFBrPRN4u5v3OFSCmBCiM9qbd9+XvMEwf/98E+l2K9cfMWGXIiRawxP3pfLC4ykcfYqXq26TNW17qN6KkUII0ScqmnxtVpOsbvbz+fo6DhyX0+WEymoxMXloOjmpP05L9IcMVlU42bDZIGdwhL2mh3joxXrGTw6Tn2lndAymYSqlmJSfzqLiBgoyk3ep+rxNbtrATdxCSqkngWFKqQd2Pqi1/nXsmiWE2NNorWnwxHfkSmvNUzsuzLbv+ity1RIrLzyewnFneLnit5K07cEkRgoh+pX2pkm+vqwCs1lx3LT8Ll0vw2FlakEGdmvrTUutJjNvP53FHXfAX5+Ilvnfdx/FqJxMBu2Q4PW2jGQrw7MdDM9ytHvOoBQbJhOtRgT7k44St+OAw4HDiJY67g4NfKCU0sA/tNaPdfM6QogBrsnb8b4yfeGTtTUsKWnitFntL8yeMivE355oYPo+Idmnbc/WGzFSCCH6RL07gDew6x5oJfVevitu4JipQ8h0dH69dkFWMhMGp2HaqQJkZSWccw58/LHirLPgpLkppKUR04RtR+PyUjuc6mkxm8hy2KjvYEuERNZR4na91vpGpVSh1vrpbl7/AK11hVIqD/hQ
KbVWa/35jicopeYD8wEKCwu7eRshRH9X74nvNMlNtW7+u7iM6cMyOGKv1guzDQMe/lMahx3jZ8qsEEX7xrd6ZEqShW2xUgMhI5KQG5YPcL0RI4UQok+UNra9tu3VpWU4bGaOmjyk09fKTUtiUn76Lq+/9x6cey643fCvf8EFF4BSfZOwbdOZ9Xl56fYBmbgdo5T6LfAz4K/dubjWuqLlvzVKqdeAfYgu5N7xnMeAxwBmz56dAGUJhBDxUOuK3y/RWleAhz7dSJbDygUHjGr1iz8cgr/cnMGCd+3kDTHiWvI/Ny2JUbkpbW5MGolo/GEDfyhC2IgQNCKEDY03aOAOhPEEwxhxHtEcYHocI4UQoi94g2HqXLt2jq6tamZlRTOnzRqGw9a5PdtSkixMHrpr0gawYgUMGQIvvAB77dWjJsdUTqoNpUiIYmhd1dFP6T2gDkhRSjXv8LoCtNa67Z/atpOUSgFMWmtXy9+PAH7f0wYLIQYef8jAE2h7X5lYcwfC3P/xBoyI5qq541ttOBrwwx3XZPLNZ0lccq2L0y9se31ArOWlJzEqJ6XNjb+3MZkUDpuFjma6eINhGr0hGj1Bmrwh/KFdp82ITutRjBRCiFipbvZT1ujFajZhNZvwBnf9Xa+15pUl5WQ5rBw6Ia9T17WYFdOHZ7Qq/LFuHZSVwdy5cO21cMUVYO/dgpG9LsliJiPZSpO3/+292m7iprW+HrheKfW61vrEblx7MPBaS8+1BXhea/1e95ophBjI4lVNMmREeGTBRurcAa6ZN54hGT9GG58X/u9XWfzwvZUrf9fMcaf37cbgSsHgdDsjc1JaJZM9EU3sLBRkJgPRpLXOFaDWHaDZF+qXvY/x0gsxUgghel2Ny8/Kcuduf58vKWliS52H8/cbic2y+ypbSsHUgoztI3Naw+OPw1VXwfDhsGoVmM2Jn7Rtk5uWNLASN6XURK31Wq31iUqpJK11YIdjc7TW33R0Ya31ZmB6L7ZVCDFAxWOueSSiefKrYtZXu7nkJ6MYPzit1XFbEmTnGtzwZx+HH983pYOTrCYyk21kOqzkpCaRbDPv/k09kJpkITXJwsicFPwhg5rmAJVOHy5/fEY/+5OexkghhOht9e5Ap5I2I6J5bWk5QzPs7DdmUKeuPTjdvr3ASH09XHIJvPYaHH44PP10NGnrT3LTkthQ7Y53M7qso27c54GZLX9fuMPfAR7Z6WshhOiWSETT4O3bxC0S0Tz5dTHfFTdwyswC9h39Y+CqrzURMSB3SISb/tIc08qRVouJLIeV7BQb2Sm2Tq8xiAW71UzhIAeFgxx4AmEqmnxUOP2EwlL0pB0SI4UQcVPrina0beuAA1hV0dypMvdfbayjqtnPrw4ZE904ezeUgtG5KUC0auTMmdHk7a674Oqr6Zfb4jhsFlLtFtz9rKOyo6cE1c7f2/paCCG6pdEb7NOiGRGteWphMQs313Ni0VCOnvLjvjUVJWZuvCSTjEzNgy80xCxpy0qxMjzLQW5aUqcqYPW1lCQL4wanMSY3lRpXgPImL42e/jelJMYkRgoh4qLRE2RluRMjoqmha0sNAmGD13+oYExuCkXDMzv1nsHpdpKt0ZRhyJBotcjTT4eioi42PMHMGT2IQEtRL1/QYH21i2CCd1Z2lLjpdv7e1tdCCNEttX24vi2iNU9/XczXm+o5YfpQjp82dPuxLest3HRJJqGQ4pa/NcYkaUu1W5hSkNFra9ZizWRSDMmwMyTDjssfoqTBS0gqU24jMVII0eea/SGWlTVhRLr3a+bjNTU4fSEuPWh0pzoOlYLGrSnMPgGefx4mTIA//albt05ISRbz9mIljiQzi7c2JnQF5o6eHoYppR4g2nO47e+0fF0Q85YJIQY8X9Cgoqlvin4YEc3TC6NJ2/HT8jlh+o9J24rFVm79VSbJDs29/2pgxNjYVFucNCS93yRtO0uzW5k8NININx8WBiCJkUKIPuUNhlla0tTtxGJznZs3fqigaHgm43Za192WcAj+92QGjz9kIT8/Oj1yIEu3WykalsnS0sZOTTmNhw434N7h74t2Orbz10II0WUba9x98ssxHInw+BdbWLS1kROnD+W4aT9Oj9Qa/nl3Ktk5Ee58rJG8obFp0JAMOxmO9sv59xemTqyH2ENIjBRC9KkN1e5urztu8gZ5+NNNZDqsnLffiN2eX7zRzF9vzmDDaivnngv33w+Zmd26db+SlWJjakEmy8uaErLSckfbATzdlw0RQuxZnN4Q1c2xr9YYMiL8/bNN/FDm5LRZwzhy8pDtxwwjWgnrtvudmC2ajKzY/JY2mxVj81Jjcm0RHxIjhRB9yYhoGjzdK+QVDEd46NON+EMG1xw+qcM9Qbd579Vk6qrMvPIKnHxyt27bb+WmJTF7ZDbFdR5q29i4PJ76YR0YIcRAsL7GFfN7RCKaRxZEk7az9i3cnrRpDc8+msLtV2ZghCE7NxKzpA1g1KAU7NZ+VitZCCFEwqj3BLq1rk3r6DKB4novF/9kFAVZye2eW7LZzLqVFpSC//tdhNWr1B6XtG2TkWxl+vBM5owZ1GqP13jrn4sthBD9WpXTj7MPNr5844cKVpQ7OXvfQg6ZkAeAEYYH/5jG2/91MO8EX8ynQjhsZgqzHbG9iRBCiAGtOyM/nkCYf3+zlUVbGzmpaCgzCrPaPM8Iw0tPOnjmkVTGTQrz5VearBRbT5s8IKQmRYuKDc1MZlWFk0AovovfdjvippQ6oDOvCSFEZ0Qimk21sd/0ckW5k7dWVPKTsTnbkza/D26/KoO3/+vgZ5d4uP5PzVhiuOzMZIKJ+emyLmwAkxgphIg1rTV17q5Nk9xQ7eL2t1aztKSJU2YWcOzU/DbP27LBzK/PzOZf96Wx/6FBPnjHIklbG7JTbMwZPYjB6a1H30wmMJvU9j+xjvadGXF7kF03Em3rNSGE2K2tDV58wdhUbdym3h3g8S82MzwrmTP3Kdz++h+vy+Tbz2z86jfNnHRWbKtZmkwwtSCTbAmAA53ESCFETDV5Q50uSmJENG+vqOTN5RXkpCRx49ETGJ3T9hrrtcstXH1ONqnpmt/e08T882wUZEtHY3usZhNTh2UwKpCCWSmsZoXF3LerztpN3JRS+wH7A7lKqWt2OJQOyGINIUSXBcIGxfWemN4jZER49LNNRDRcevAYbJYff6me9Qs3R51s5oC5sV1svC1py01Liul9RPxIjBRC9JXO7nda5w7w+Bdb2FjrZs7obM7aZwTJtl1/HXncipRUzbjJYX52iYcTz/SSkwtDMzN6u+kDUjy39enozjYgteWcHTd7aAZOjWWjhBAD08Yad0w3tnT6Qvz9s00U13u57JAxDE63s2qplRWLrPzsEi8Tp4WBcMzuD5K07UEkRgoh+kRn1rd9t6WBf3+zFY3m4p+MYs7oQbuc4/MonnowhU/esfPYa/VkDdKcd3m0M7Ug04FZpvUnvI62A/gM+Ewp9ZTWemsftkkIMQA1+0NUNsWu/H9xnYeHF2zEEzC45MBRzCzM4tN3kvjbLRnk5RuccKYPR0p
sK5GYTYppwzIYlCpJ20AnMVII0Rdc/lCHywtc/hD/+a6U74obGJObwsU/Gd1mx+G3n9t44Pfp1FSaOf4ML7YdTlEKhksRrX6hM2N9SUqpx4CRO56vtT4sVo0SQgw866tiV/7/m831PL2wmDS7lZuOmsjwbAfP/SOFpx5IZeqsILc90BTzpM1iVhQNzyTTIWva9jASI4UQMVPTwWjb4q2NPPvtVrxBg5OKhnL0lPxdRs1CQfjrbzJY8K6dEWPC3PvvBqbMbF3VOTctSbas6Sc6k7i9BPwdeByIbUUBIcSAVOX00xSD8v8RrXl1STnvrapi/OBUfnnwGNLsVu67LY23X3Iw9zgf1/yhGVuMcymrxcSMwkzSO7GpqRhwJEYKIWKmrWmSRkTzzMJivtpUT2G2g2vmjWR4VtsjZlYbmM2a8y53c8ZFHqxtxMP23isST2cSt7DW+tGYt0QIMaD4QwbVzX6qmwM0+3o/afOHDP75xWZ+KHNy8Phcfr7PcCymaCGSyTNDZOe6OecyD6oPpuxPH5YhSdueS2KkECImypt8uP2t12WHIxH++cUWFm9t5Nip+Rw/PX977Ntm01oLD/85jat+10zhaIMb72xuNxam2qX8f3/SmcTtTaXUZcBrwPa0X2vdELNWCSH6HV/QoMEbxOkN0ewP7RJsektEa9ZWunhxUSmVTh9n7lPIoRNyKd1ioazYwv6HBZh3QuzW0u0sw2GV6ZF7NomRQogu8YcMtI6uLTObFNY2SspvqfOwqab1nqdhI8I/Pt/M0tImTp89jCP2GtLquMetePrBFF5/3kF6pqa2ykzhaKPNpM1uNZOVYmVYpoy29SedSdzOa/nv9Tu8poHRvd8cIUR/1OgJsqy0CSMSu3VkLn+ILzfW8fmGOmpdAdLsFq6aO569hqazZKGN31+dQUqqZvZPAjGfGrmjQlnQvaeTGCmE6BSnN0RxvWeX6Y+OJDPDMh3kZ9qxmk2sr3ZRUu9tdU7YiPDIZ5tYXubk53sPZ+6kwa2Of/ZeEo/cmUZjnYljT/dxwa/dpGdGY3Kmw0p6shW7xUyS1USa3YLDFr+S9qL7dvtT01qP6ouGCCH6pwZPkB9inLQt2trAMwujC7DHD07lpOlDmTkiC4vJxP+eS+bRv6RRODrMHx5u6tOkzW41kydl//doPYmRSikzsAgo11of13utEkIkCn/IoMETpNLpo9HT9rIBb8BgfbWLTbVuHDYzrjZmrLy9opLlZU7O2reQQyfk7XJ8/Uorg/Ii/P7BJiZM/fH9yTYzMwqzpNT/ALHbxE0p5QCuAQq11vOVUuOACVrrt2LeOiFEQnD6QpQ1eqlzBxmUYmNoZjLZKTbq3QGWlzljlrQFQgYvfF/KFxvrGDnIwQX7j6IgKxmASATuuy2Nd152sN+hfm76S3PMK0fubFhWMqovFtGJhNXDGHklsIbopt1CiAHCGwxT1uijzh3AG+h8zSIjottM2iqdPt5dWcW+o7K3J20et+Lfj6Swz0FBZs4Jct4Vbi60gHmn4pCT8tMlaRtAOjNO+iSwGNi/5esyolW0JHETYoCrafazpc7TKpBUOf1UOf0k28wEw5GYJW3VzX4e/GQj1c1+jpkyhBOKhrZagG0ygSNV8/P5bs6/woNp1yUCMWU2qe1JpNijdStGKqWGAccCfySa+Akh+rlaV4DSRi8N7mCvXVNrzb+/2YrNYuL02cOJROCjN+w8fk8qTQ0m0jI0M+cEW+3Ltk1BVrSTVQwcnUncxmitz1BK/RxAa+1T0sUsxIDW5A2yocaNs4MS/h1tCNob97/3o/X4QxGumTeeSfk/DkhsXGPBMGDClDDzr3P3SdXItmxbiyD2eN2NkfcBNwBpsWycECL2tNasqXRR0eTr9Wt/vbme9dVuzpkzgqpNyfz2j2msXW5j0vQgf3i49bTIHdmtZsblpfZ6e0R8dSZxCyqlkokutkYpNYYdKmcJIQaOsBFhdWUzNc3x+7+4OxDm3o824PKHuf6ICYzMSdl+7JO37dzzf+mMmRjivmcb45a0gRQlEdt1OUYqpY4DarTWi5VSh3Rw3nxgPkBhYWFvtVcI0YsiEc3KCmdM4qbLH+KlRWWMyU3hwHE5vPWClZoKMzf8ycnc4/0dzjSZmJ+GRToXB5zOJG6/A94DhiulngMOAM6PZaOEEH3PGwyzrLSpS/Pxe1sgZPDgJxuobvZz5dxx25M2Iwz/ui+Vl55MYeqsILfe2xTXpG14tkMqcoltuhMjDwBOUEodA9iBdKXUs1rrs3c8SWv9GPAYwOzZs/t2AacQYrfCRoQfypw0enpvauQ2Ea3595dlVCwYyX4HZmNSEY49zcfc4/2kpLb968BiVuRnJDM0006a7C06IHWmquSHSqklwBxAAVdqreti3jIhRJ+pdwdYUe4kbMTv2TAcifDo55vYXOfhlweP2T490uNW3H5lBku/SeL4M7z88iYX1jhO2c9OtTF+sEw/EVHdiZFa65uBmwFaRtyu2zlpE0IkNpc/xMryZjyB3t+z1Iho7njQxcIXpmE0O6jI8wIuzBbaTdrG5qVSmO3AJIVIBrTOdhkXAOaW8w9SSqG1frUzb5Ryx0IkLk8gTHG9hyqnHx3H/nytNc99U8LK8mbO3W8EMwuzth+zJ2vsyZpr73By1E/7bmPttjhsZqYWZEglSbGzbsdIIUT/orWmuN7Lljo3kUjvX3/dSjO332yndvMQckZ4ueG+Bmbs2/56c4BBqbZWywrEwNWZ7QCeAKYBq4Bt/0Q10NmgJOWOhUgwLn+ILXXRTUDjmbBt887KKr7YWMdxU/M5aFwuAB++YWfGnCA5eRFuf9AZ16mRAGazYvrwTClIIlrpaYzUWi8AFsSibUKI3tXkDbKxxk1TB4W7eiKiNa9/W09DzQgOuaSYGy+3Y7F0HPwsZtWqgJcY2Doz4jZHa71Xdy4u5Y6FSDy1rgAry2O391pXfbO5nteWljNndDYnFg0lGICH/xTdn+20CzxxrRy5jdmkmFaQQUqSrGsTu+h2jBRCJD6tNdXNAUoavDT7ej9hczYqnv17KgHlxzv1B4qtXs65182Z+xV0anbHuMFp2K3m3Z4nBobOPIUsVErtpbVe3Y3r34eUOxYiYVQ5/ayqcCbEKBvAqgonT35dzITBaZy/30iqK8z8/upMNqyy8vP5bs673BPvJmI2K4qGZZIle+GItvUkRgohEpjLH2JFuTMmRbt8Xnj13w5e/FcKPq8ibWY1Y8aGuOiAUcwZnd2ppC071UZBpuwnuifpTOL2NNHAVEW0xLECtNZ6WkdvknLHQiSW0gYv66tdCZO0rSx38tCnGxmaYeeyQ8awYZWNW36RRUTD7Q82sf9h8d91xGoxMaMwk3SpziXa160YKYRIbKUNXjbUuHp1HVswHKGsycsnH5p4/cEC/E4byeOqGHHoBn56WBqHT5pCkqVzo2dWi4m9ZIrkHqczidsTwDnACn6cv98ZUu5YiARR4/KzrsoV72Zst7ysiUcWbGJoZjLXHD6elCQLhaMMivYNcvE1boYWxm9LAogGxD
S7hQmD02R6pNid7sZIIUQf01rjCxl4AgYaTWqShWSrefvolj9k4AsalDX6qG7ueTGsBk+QddUu1le52FTjoaIuBLYwwdpULJmZzPvFeg480MSUoaO6VL7fYTNTVJgpUyT3QJ15IinRWr/R1QtLuWMhEoM/ZLC6ojnezdjuh7ImHm1J2s4vmsgTf83g0htdpKRp/u8+Z1zbNrkgnewUW6d7PIWgmzFSCNF3apr90TVq/tAuI2gmE9gtZgJGBKMXtsSpdwf4cmMd32xp2F4ATJcOxvnFPuQWhJh/WzUjsh0MujYI5Hb5+hkOK9OHZWKzSKGsPVFnEre1SqnngTeJTgMBkFLHQvQDWmtWVTTHdX+2HS1YV8Pz35UwPNvBIclTueZn2QT8isNP9LPX9NhU6eqswkEO8jNkrYDoMomRQiQgrTWVTj/F9Z4O16hFIuAN9myWRySi+aGsiQXra7d3lE7KT2dseBSLXy1gw3I7QwoMTv2p0Wq7m90xmSAj2QoolIJkq5nxg9Mwy15te6zOJG7JRIPRETu81pXtAKTcsRBxsrXeS6MnGO9mEIlo/ru4lI/W1DBlSCbJK6bx+yfSGDU+xG/vdlI4Or5TIx02M2NyZVNt0S09jpFCiN7V4AmyrsoVk82xdxQMR1i4uZ4PVlVR7QqQ5bBy3LR8fjI2h6/ezOTBO9IZlGfw61ubOepkH9Yu1LhSCqYUZJCXZo/dBxD9TmcSt8e11l/t+IJS6oAYtUcI0UucvhCb69zxbgZuf5gnvt7C8jInh0/Ko+LNybz8ioNjT/Pyy5tcJMU5JikFk4dmSA+m6C6JkUIkCG8wzIZqN7Wu2Ba3CoQMPl1Xy/urq3D5w4wY5GD+gaNJc+diNikGpYY5cF6AYMDF8T/zdivOjR+cJkmb2EVnErcHgZmdeE0IkQD8IYPieg8VTb5erYbVVWEjwqfranlzeQX+kMHPZxcyd688tmZ7mTEnyKHHxL9qJMCIQQ4yHFI1UnSbxEgh4igS0dS4ApQ3+WI+wyQQMliwvpb3VkUTtr3y0zlm6hBUXRbP3pXGNwuS2OegAH98tImsnAinnu/t1n1G5qQwPNvRy60XA0G7iZtSaj9gfyBXKbXj5tnpgKzcFyLBhI0IG2rcVDrjm7BBtGrki4tKqW4OMHFQJsGvp/LdCiuH/aWZEWMMRoyJ79TIbexWM6NzZIqk6DqJkULEX7M/xJKtjTFdxx3Rmg3VbhZurmfR1gb8oQh75adzwvShUJ/J039I5ZsFSaSlRzj/Cjcnnd29ZG2bIRl2xuZJXBJt62jEzQaktpyz4wbazcCpsWyUEKLrVlc2U9Mc31GsRm+QF74rZXFJI0PS7Zw6cjIv31XA1k1WTr/AQyQC5gR6pB2cnoRJpkiK7pEYKUQcBcMRlpc6Y5a0VTT5+HZLA99uqafOHSTJYmL2iCwOGp/L6JxUlIJX37OxaomV83/t5qSzvKSk9qwtdquZiUPSdn+i2GO1m7hprT8DPlNKPaW13tqHbRJCdFFxnSeuSVskolmwvpZXl5ZhRDQ/nV6Ad+koHrgtnbTMCHf+s5FZ+8e/SMrOctOS4t0E0U9JjBQifrTWrCh34g/13uwNpy/E1noPW+o8/FDmpKTBi1Kw15B0TioqoGh4JuuW2Xns5hQOPcbPUSf7Oe4ML0ee7OtxwrbNpPw0LGYp8y/a15k1bl6l1N+AyUQ30gZAa31YzFolhOi0eneATbXxK0KytqqZF78vpbTRx+T8dM6aU4g14OCiS9OYdUCQa//gJDM7MbYj2JHNYiLT0YUSX0K0TWKkEH1sQ427R+vZQkaE4joPm+s8bK71sLnOTaM3uiWNIrr2+YzZw9lnVDbpdivffWHjpv9LYdVSG1mDDOYeF72OLQlsSb0T3/Iz7QxKlc5E0bHOJG7PAS8CxwGXAucBtbFslBCic3xBgxXlTnQc8qJaV4CXl5SxeGsj2Sk2fnHQaKzVg8lJCWFKi/DwfxsYMsxAJehMRBltE71EYqQQfajK6aekvuvryLTWbKhx883mer4vbsTXMlqXm5rEuLw0RuY4GDkohcJsB3brj3P677wpnY/fTCYv3+DyW6Jl/Xu7GnKS1cT4wTJFUuxeZxK3QVrrfymlrtxhashnsW6YEKJ9/pBBWaOP8iZfn26urbVmXbWLj9fWsKy0CavJxInTh/KTEUP4112ZvP9aMtf/0ckRJ/nJH54YBUjakyeJm+gdEiOF6COeQJg1Vc2dPj9kRFhX5WJFuZMfypq2r1WbWZjFrBFZjMlNIc3euqpwMABv/zeZg470k5ahmXusnxn7Bpl7nB9LjAoQTxySjlWmSIpO6EziFmr5b6VS6ligAhgWuyYJIdqzLQjVuPx9WjlSa82Skibe+KGC8iYfKTYzR00ewqET8ihbk8KvT8+gptLEmfPdHHasv+8a1k0WsyJLpkmK3iExUog+YESi69qMTnRWVjT5eGdlJUu2NhE0ItjMJiYMSePE6QXMKMxsNaK2jcupeOu/ybz2bweN9dHjx57uY+8DY7c+O8NhZVxeqkzbF53WmcTtDqVUBnAt0b1p0oGrY9oqIcQuti3GbnD3bZGP9dUuXl5cxuY6D/kZds7fbyT7jMrGZjHx3yccPH5PKvnDDO5+upEpM0O7v2ACyEmVapKi10iMFKIPrKty4faHOzynpN7L2ysqWVzSSJLFxP5jBjF9eCYTBqdhs7Q9ohWJwGN3pfLOS8n4vCZm7R/g9IuambFv78Rai1mRkWzFajYRMiIEw9Fe15E5KQxOlw22RdfsNnHTWr/V8lcncGhsmyOEaM/aKlefJm0lDV7+t6yc5WVOMpOtnLffCPYfk4N5h4RnwtQQx57uY/61bpJTEq8ASXtkmqToLRIjhYi9SqePiiZfu8f9IYOXF5exYH0tyVYzx03L5/CJg0m1t/+YW1VuYkhBBJMJyost7H9YgNMu8DJmYsfJYWdkpVgZnG4n02EjNakzYyRCdM5u/zUppcYDjwKDtdZTlFLTgBO01nfEvHVCCCDai1je2H7Q6k1VTj//W1bOoq2NOGxmfjqjgMMn5ZFkMRMKwr//kYJhKC66ys30vUNM37t/jLJtYzIhlbtEr5EYKURsOb0h1la62j2+stzJM99spdET5PBJeZwwfSgOW9uPt4YB3yxI4pVnHKxaauWZ9+oYPDTC7Q81YeriEjOrxcSgFNv2AlxaQ2qShSEZ9janYgrRGzrTDfBP4HrgHwBa6+VKqecBCUpC9IFaV4ANNe0Hrd6ytd7De6uqWLS1EZvZxHFT8zli8uDtAXDjGgt/uyWdzeusHPlTH1qTkBUjHUlmCrMdaA2BcISQEcEdCNPsC6E1DEpJajVqKEQPSYwUIkacvhBLShsxIrvO6PAGw7z4fSlfbaonP8POTUdPZExuapvX8Xnh/dei69cqSi3k5RtcfI2btIzodTubtCXbzOSmJZGbmkSmw4pKxCAoBrTOJG4OrfV3O/3j7Pk4shBit2qa/aysiG25/7VVzby9opI1lS6SrWaO3GsIR04evL3SVigIz/0jhRceT
yE9I8LtDzax/2Hx2+y7PWazYlRLKee21q8ZEU2TNyibm4reJjFSiBho9odYWtLYZjGSleVOnl5YjNMX4pgpQzh++tA2qzKGQ2Cxgsdt4u9/TWP85BAXXtXETw4PYG7jCVgpSLNbSbaaiWhNpCX4ZqfYyElNIkWmPYo468y/wDql1BhAAyilTgUqY9oqIQTlTT7WVjbHLGnbXOfmtaXlrKl0kZFs5ZSZBRw8PneXKSZV5Wb++0QKhx7j55c3ukjPTLy1bA6bmZkjsjqcnmI2KZkiKWJBYqQQvSgYjs6SWFHu3GW7m5AR4blvS/hyYx1DM+xcdshYRuWktDpHa1j8tY3Xn3cQDMBfHm8iJy/CE2/WM7Rw121qlIL8jGRy06KjaFKWXySyziRuvwIeAyYqpcqBLcBZMW2VEHu4rfUeNlS7Y3LtkgYvby6vYGlJE6lJFk6fPYxDJ+S1ClZ+H3zxoZ15J/gZPsrgX2/WkT+sD/cf6KIxeamypkDEi8RIIbqhxuWn3h0kGI4QNCIEQhGChtHuVjchI8JDn25kVUUzR00ewolFrUfZPG7Fh6/beeM/Dkq3WMgcZHDsaT4ikehUyLaStkyHlQlD0nbZy02IRNVh4qaUMgO/1FofrpRKAUxa69gvthFiD6S1pro5QEmDl2Zf7xf82Fzr5q0VlSwvc5JsNXNi0VDmTRq8S8Lzw3dW7vldOhUlFkaNCzN2Ujihk7ZUu0VKKou4kBgpRPc0eYOsLHd2ej/SYDiatK2pbOa8/UZw4Ljc7ce2rbd+/9VkHv1LGhOnhrjxTicHHenH1s72aDaLibF5qQzNTO6FTyNE32k3cVNKWbTWYaXULACttafvmiXEnqW0wcvWei/+0K49gj3hDxks2trIVxvr2FDjJsVm5qSioRw2MW+XKZEup+Lxe1J552UH+cPD/PVfDYydlPhLdUbnpuz+JCF6mcRIIbrHHzJYXtb5pC0QNnjo042srXRx3v4j+cnYHIJB+PJDO2+9mMyRP/Vx5E/9HHGSj8kzgkyY2n7cMpmgMNvByEEpst5Z9Esdjbh9B8wEliql3gBeArYHJq31qzFumxADnj9ksLqyudf3Z6t0+nhnRRWLSxoJhiMMTkvitFnDOHh8bptTCiMRuOa8LEo2Wzj1fA/nXe7G3g86ItOTreSlyWibiAuJkUJ0kRHR/FDatH0T6t0JRyI8smATaytdnH/ASEZaBvPPu5N5/7VknI0mhg4PY22Z5ZiarndJ2swmRZLFRJLVhMNmYeSgFJJtMq1e9F+dWeOWDdQDhxFdfK1a/itBSYgeqHH5WVPpItTJANYZjd4gb/5QwRcb67CZTcwZlc3+Y3IYk5vSZtnimkoTg/IimM1w8TVuBuVG+sUo2zYy2iYSgMRIIXZDa40/FGFjjRuXv3MxRmvN019vZWV5M+ftP4IDxuTwq9Mz2LjWwn6HBjj+DB8z5gR3KeVvNisGp9kZmhndAFuIgaSjxC1PKXUNsJIfg9E2iVdWToh+ZGONm+K63ptZ5Q2GeXdlFR+tqSaiYe7EPI6dmt/ugutwCF79t4N/P5LKRVe7OOksH/se1Lujfr3NZAKLyUTIiKB1dFF5jlSJFPEjMVKIdkQimjpPgJrmAC5/GF8o3Ompkds88U4d77yci948g5k/dQKaq25rJisnQk5e2xcbk5fK8KxkmQYpBqyOEjczkErrYLSNBCUhusGIaFZXNFPd7O+V64WMCJ+uq+Ht5ZV4ggb7jsrmpKICctPaT2hWL7Ny/+1pbF5vZc4hAfY/NPZ7sqUkWTCp6OcPRzQ2i4k0u4V0uxWtYX317us5jM1No3CQA601gXAEk2x8KuJLYqQQOwkZEdZVuah1B9rcf213fF74/H07LzxnoWzNYJQ5wgGHBnG7FKnpmnF7tT1apxTsNTSd/Ix+MMdfiB7oKHGr1Fr/vs9aIsQAFwhHF2Q7vT2vGBnRmu+3NPDq0nLqPUEm56dzysxhFA5ydPi+5/6ewlMPppI7xOB39zdxwNwAsc5/RuWmMCY3tcNzwpEIm2vbH4HMTrVt/2xKKSn9LxKBxEghdhA2IiwtaepyVWStwetRpKRqVq0Pc9dvM7Bku9nrxGJuvcpBTl7HCaDJBFMLMjvssBRioOgocZPubCF6KGREqHNHp4s0eIIYkZ53xK+vdvHfRaUU13spzHZw3n4j2WtoervnRyLRqZG2JJgyM8hpF3g455ceklNiOyhgNismD03vVPGQ0bmpeIMGVc5dRyKtFhN75bf/+YSIE4mRQrQwIpplpV1L2mqrTHz4hp0P/pfM8HEBRpy2nG83N1B4UTHHHZLMsdPysVl2jVMWc7TzzmYxYTObGJaVLGvZxB6jo8Rtbp+1QogBqN4d4Ieypi7P62+PP2Tw7Ldb+WZzA1kOKxceMJI5owd1OGVw/SoLD96RxuQZIS69wc30fUJM36f394jbWbLNTNHwTFKSOlP/KGqv/HT8IYOmnUYkJ+WnyQibSEQSI4Ugup7th7KmXX53t+frT5J4/flkln5jQ2tFwUQXWxybqN/axBGTB3PU5CHtrs8uyEpmXF6qrGETe6x2n6q01g09ubBSyg58DiS13OdlrfXvenJNIfoLX9BgRRc2F92diiYfj3y2iepmP8dNy+foKUNIsrSfzDQ3KZ64P5V3XkomMzvCT8/y9U5DOsFuNTOzMKvLJZdNJsW0YZlUN/sJhA38oQh2q1nK/YuE1NMYKcRA4A6EWVPZ3OESAMOA5d9bmTorhMUKa5ZbqSixcORZDVTmraPJ2sicEVmcsfcUstoZOXPYzEzMTyc7RUbWxJ6t893hXRcADtNau5VSVuBLpdS7WutvYnhPIeLOaOl9DHdjYXZbvt1SzzMLt2KzmLh23ngmDul42uC3n9n4y00ZeDyKk87yct7lHlLS+qZWQpLVxMwRmd3eJ8dmMTE8u+N1ekIIIeIrEtFsrvNQ0uBps4NSa9i8zsInb9v55G07ddVm7ni0kX0PCnLmL1wMOmgdb62sJDc1iav2HceUgox275WebGXWiCzMJpmdLETMEjettQbcLV9aW/5IpS0x4K2pbMbdyX1qOhIyIrzwfSmfra9lXF4qvzhodIfz+ENBsNqgYITBhKkh5l/vYtQ4o8ft6CybxcTMwiwctlj2BwkhhOhrISOC2x/GEwzjDRrUuQJ4g23Hl9oqE7/5RRbFGy2YLZq9fxLk0htcFO0TxBc0+NfCLSwra+KAMYM4a98R2CztT3tMspqYNixDkjYhWsT0CUspZQYWA2OBh7XW37ZxznxgPkBhYWEsmyNEzG2t97RZYKOrqpr9/OOzTZQ2+jhq8hBOmjEUy867jLaoLDPxz7vSiGi47X4nw0Ya/Pmxph63YXfMJkVuWhKpSRZSkiykJ1s6nL4phBCi/6lp9rOmykUo3Pbc/8Y6E5+9H63oeNJZPgblRSgYGeaEn3s56Eg/GVnRPvsqp5+HFmyktjnAmfsUcuiEXFQHa7RNJpg2LFPWOAuxg5gmblprAyhSSmUCrymlpmitV+50zmPAYwCzZ8+WETnRb9W4
/Gysce/+xA5orflmcwPPfrsVi0nx68PGMm1YZpvnej2KFx538PJTKZjNcMZF0Skr7eR3vW6voekMTpf1Z0II0V95AtERNH8o+iclyUKWw0ayzbx9T7a2OiOdjYovP7Sz4L0kln9vIxJRzD4gwEln+TCZop2I22it+XpTPc9/V4LVbOKaeeOZMCRtt22blJ9ORnLbRUqE2FP1yZwmrXWTUmoBcBSwcjenC9HvNPtDrCpvRveg66HBE+TZb7ayvNzJ2NxU5h80ut2F2KuXWbn9ygwa6szMPd7HxVe7yRncS5VQOmFkTookbUIIkeA8gTCVTh+5afZWSVCNy09xnbfd8v12qxmNJhD6Ma64nNFNsJWCx+9J471Xkxk2MszP53s49Gg/I8buOnXSGwzz7DclfFfcwITBaVx84Kh2C5DsaHRuimymLUQbYpa4KaVygVBL0pYMHA78JVb3EyJe/CGDH0qbur1Hm9aaLzbU8dLiMgytOWP2cOZOzMPUxpx+jzu6SemwkWHGTQ5z9qVNTJzW8/V0XZGTlsSY3JQ+vacQQoiuqXcHWFHuJGxoiuu8OGxmctOSqHMH8QQ6jhv+UDQJczYqvvrYzhcfJLH0WxsPvdDA2ElhTr/Qw4lnehkzMUxbsx29wTBfb6rng9XVNHmD/HRGAUdPHtJmXNvZ+MFpFA6SIlVCtCWWI275wNMt69xMwH+11m/F8H5C9LmQEWFZaVOrXsmuiGjNf74r4dN1tUwcksZ5+40kNy1pl/O2bDDzz7vSaKw38fB/G0jP1NzxSFMPW991jiQzU4amd7guQQghRHyVNnhZX+1qNQvEGzTYWu/t1Puryk3c83/p/PC9jYihGDIszCnneknLiMa64aPaLkxS6fTx4epqvtnSQDAcYXROCr84aDRjclN3e0+lotMjh2bKSJsQ7YllVcnlwIxYXV+IeAuGIywtaex2BUkjonnq62IWbq7nyL0Gc+qsYbskRHXVJp5+OJUPXrPjSNWc+QsPEaPv1rHtKDctiYn5abLxqRBCxJk/ZNDsDxEMR6J/jAhhQ2NENEEj0uG+am2pLDXz5cdJZGZHmHeCn8zsCM1NJs64yMNBRwTaHVnbptYV4M3lFSzcXI/FpNh31CAOnZDLiEGdm51hMsGUggzZt1OI3ZC63UJ0QyBssGRr026nm7QnZET45xebWVLSxElFQzl2av4uSdvqH6zccGEWRhh+eraXM3/hIT2z7+v3WMyKCUPSZL2BEEIkgOpmP6srmzF6uFfoprUWvvwoia8+TmLL+uj6t8OO9THvBD/2ZPj7K7vfY94bDPPa0nI+31CHAg6fNJhjpgwhzd61oiKT8tMlaROiEyRxE6KL/CGDJSWNeAPd2yOtxuXniS+L2Vjr5md7D+fwSYO3HwsGoKzYwugJYcZNCnHsaV5OOttL/rC+KzyyjVIwJMPOmNxUKccsRC9SSg0HngGGABHgMa31/fFtlUh0kYhmXbWL8kZft94fCsKG1Vb2KoqOxj35QCrff2Fj8swQl97gYv+5/i7FmhXlTp5ZWIzTF+LAcbkcNy2/U4VHdiaFSIToPEnchOiC6mY/66pcBNvZz6YjWms+W1/LS4vLMCnF/ANHs8+obACMMHz4hp1nHk7FMODf79dhS4Jf3tSz7QW6QynIz0hmVE4KyTZJ2ISIgTBwrdZ6iVIqDVislPpQa7063g0TickbDLO8zNnlqfnNTYpvP0/imwVJLPrShtdj4j+f1pKTF+HSG1xc+4cIWYO6NnLnDYZ58ftSvtpUz9BMO786ZCwjc7pXsGpoZjKjO7H+TQgRJYmbEJ0QCBusq3JR0xzo1vudvhBPfLWFVRXNTMpP44L9R5GdYiMSgS8+TOLph1Ip3Wxh4tQQF17lwrZrfZIusZgVQzOTKW/y7XY6TZrdwuB0O3arGbvVRLLNLBtpCxFDWutKoLLl7y6l1BqgAJDETeyizh1gZUt1yN3RGiIGmC3w5UdJ/OHqDCIRRXaOwcFH+ZlzSJD0lgIjw0Z2fdbImspmnvhqC05fiGOmDuH4aUOxdmPds1IwON3OpPzd7+cmhPiRJG5CdGDbHjhljb5OBc22bKhx8ffPNuMLGpy1TyGHTMjdvp5t+fdW7rgmk8LRYX53fxMHzA10uAB8Zxazwojo7ZXDlIr2YI7JTcVmMVGY7WBtlYs6164Jp81iYkxeKkMz7FIlUog4UUqNJFrI69s4N0UkoC11HjbXujvcI9TnhaXfJPHdFza++zyJs37h4djTfUycFuLMX3iYc3CAcZPDPSpqFQgbvLqknI/X1jAk3c7NR49lVDdG2RxJZgoykxmSYZcOQiG6QRI3IdpQ5fRT2ujtcmWuHWmt+XhtDS8tKmNQqo2rDh/HsEwHS7+1UVVq5pjTfEzfJ8QfHm5k7wODmLsQw5SCwmwHY3JT0YAnGMbtD5Nqt5C+w6Jwu9VM0fBMqpx+6twBTEphMkGSxczwrGSpEClEHCmlUoFXgKu01s1tHJ8PzAcoLCzs49YNTEZEs7nWjTdoYLOYsFvNZDtsZDi6VkyjL6yvdlHSQfn+UBB+e1kmKxbZCIUUyY4IM/cPkj88OpKWkxfhvMs9PWpDvTvAd8UNfL6hjlpXgLkT8zh5ZkG3kq5BqTZmFGb1qD1C7OkkcRNiB/6QwZrKZurdwR5f5+mFxXxf3EjR8EwuPGAkG5cnc++DqaxYZGP4qDBH/tSH2QJzDunavRw2M5OHZrR60Ei3W1slbDsbkmFnSIZU7BIiUSilrESTtue01q+2dY7W+jHgMYDZs2f3fUnZAcbpC7GqwrlLYaktJpgwJJ2CBNo/rKLJ1yppczkVS7+xsegrG0rB1be7sNogLUNz4lle9j4wyNSZQaxdrw2yi0DI4PviRr7eXMf66ug669E5KZyz7wj2GprerWvarWamFGT0vHFC7OEkcROC6OhYeZOPDTXuHpdYrnT6eHTBJiqb/Zw8o4AJtmH87tI0ln2bRHauwa9+08wxp0aTts5SCrJSbAxOtzMk3Y7ZJFMbheivVHRu8r+ANVrre+LdnoHOHzIoa/Sxtd7T5pTDSATWVDTjCYQZl5ca96njTm+ItVXRAdj3XrXzzsvJrFthJRJROFIj7H/Yj1Pff3u3s9v3iUQ03qBB0IjuBdfsD/Htlga+3VKPPxRhSLqdk4qGss+o7B6V6jeZYOqwjG6thRNCtCaJm9hjGRFNvSdATXOAOneg22vYdrR4ayNPfLUFm8XEFQdNYNqINLasV5RstnDpDS6OO8NLUhfin8WsGJWTIusBhBhYDgDOAVYopZa1vPYbrfU78WvSwOIOhKlu9lPrCnS6EmNJvRd3IMzQjGRSksyk2CyY+qiTTGtYuxbefT/C/96OcP2fIdkBtVXR3/tn/sLDrP2CTJoe6lKnH0T3DS1p8LKlzkNxvYc6V5AGbxCnN4SxUyZrNStmj8jmoPE5jM3tnSR2/OA0MpITbyqqEP2RJG5ij6O1pqzRx+Y6D6FulPVvz5KSRh5ZsImsxmEEF03gre810/7WzKjxYZ77sA5LF+KWUjAsy8H
o3BTppRRigNFafwnIsHk3NHmDuPxhXP4wnmCYJIuJzGQbmSlWbGYTNc0BKpy+LpfN36bBHaShZaq8UpBmt5KfYWdwuh2bpfd/F69YAX/9K3z8MVRWApgYOtxMTYWZEWMNzv6lh3Mu69o6tUDYYGONm3VVLtZVuyiu92JEoglalsNKXpqdCYPTyEqJTrFPspiwWUwkWcyMy0slJal3Hg1NJijMTmFYlqNXrieEkMRN7GFqXQE21Li6vXl2e6qcfh58ykPzNz+hZGsGg/IM5h7zY7DtStKWarcwtSCj14KnEEL0d03eIBtr3DS1UTCqu9u07I7W0OwL0ewLsaHGRXZKEjaziYiOVvLNTUvq0trhykpYsAA+/RROOw3mzYNAAN5/Hw482GBskZcpe7feBLuzA14hI8LyMiffFTewvKyJkKExK8XIHAfzJg1mTG4Ko3JSyOzGBtldZTJBQaaDEYMc2K0yU0SI3iRPhmKPEDYirK1yUeX09/q1g+EIt/4hRNm7s8gZEubXtzZz5E993dqLbVCqjakFGVLtUQghAF/QYH21i9o2tjTpS5EIu2yrUuv2k2w1d1iR0ueDa66JJmvr1kVfS8/QTJ4W4dDDFLNmmfhmpYct9R2X/G+L1priei9fbKjl++JGfCGDNLuFA8fmMm1YBmPzUmOeOFnMirF5qThsPz5OOmxmSdiEiBFJ3MSA1+wPsbLMiTfYe6NsoSB8/JadghEGiwMbCI128bPrUjnvbNWl0bUdFWQlM3FIWtwXxgshRCKodwdY0cmNp+MhEoHl5U3sMyqbJIuZrVvh88/hiy8gNxf++Eew26OjbGPHwrnnGxRMcjFkdACzGRasiyY+Xf18YSPClxvrWLC+lrJGHzaziVkjspgzOpuJQ9L7rHhVVoqNyUPTJUkTog9J4iYGrEhEU9LgZXOdm0gvLWXzehTvvJTMK884qKs2M+3wOpyz6jhpv3xOmtH9YDkqN4Uxuam900ghhGiD0xvCG2p77ZfW0el2ISNCMBxNJKxmhdmkSLKayXJYW42qxNrWeg8ba7o+CtXXAqEIv7wyyMdv2SkpicaAjAw444zocaVg9Wqobvazpqp5l6rFXUnatNYsK23ipcVl1LgCFGY7OHvfQvYZld2nPxuAcYNTGTGo6xtwCyF6RhI3MSA1eoKsrXLhCXRvgXpbXnnawbN/T8HdbGLCDB9Djl9HU3Y5M4ZlcsL0od2+bn6mXZI2IUTMVTb7KGvwdfv9dquZrBQryVYzFpMJs1nhsJrJSum9dVP+kMGGajfVzb0/rb0nggFYv8rKqqVWVi6xUbLZzJNv12MygT9sMHFqmGuvtXDwwYopU8DcMggVCEc/T0+n6Vc0+Xju2xLWVbsYmmHnqrnj4rYv2ujcFEnahIgTSdzEgOIPRatp9dZattItZoYUGNs3NZ26d4DUvTexXm/Fnmxl/qxR7DMqu9vTG7NSrEwa0r0NTYUQoi/5QwaVTbtOOc9wWBmdk8Kg1B8X9mqtCRm605UYQ0aErfUeSht82ysgxlN9rYn0jAhWG7zxn2QevTONcDj6e37YyDDTZofweRUpqZoLr/QAHmwWExnZDjTJaK0oa/Sxqdbd46me27aZsZpNnLVvIQeNy43bXp556UmMlo5GIeJGEjcxIISNCMX1XkobvD0O+lrD8kVWXn4qhW8WJHHDn5zMO9FP0TE1fJ+2mU2eEEdNHsJx0/J7NLffYTMzbVhmn+0TJIQQseD0hlha0kSmw4rVbMITDOMPGUQi0VG6jGQrGclWHEnRohVJFhNmpfAEw9tL+1c6fXFbyxYKwqa1FtYst7LmBytrfrBRVW7m7qcbmDY7xJiJYU4+x8teM0JMnhEkM7vtdgbDETbWuCmu92C3mru9JcE2Ea1544cK3lpeyaicFC47ZAxZfVAVsj2pdguTh8ZnlE8IESWJm+gXIhFNIBwhEDbwh6LrMCJaE9FgRCKUN/l7vCdbJAIL3rHz8jMONqyykpEV4ZzL3Mw6IMD7q6p4dUk5mQ4rNxw1ocdTG9PsFqYOy5A92oQQA0Zbpfr9IQN/yNhl6qNSxGX9mtZQVW5izQ82CkeHGTspzIbVVq48KxuAnMEGk6aHOOksL0MKoqOLk2eEmDxj18/WnrChcRs9S9oCYYN/fr6FZWVN/GRsDmftWxjXeGG1mJg+LDNuI31CiChJ3ERCCBkRPIEwnqCB2x/GHQgTCBmEIhojEum14iJtCQbBZos+SLz4LwehsOLK3zUz7ZAmVlQ38MDCBsoafcwozOT8/Ub2aH+1VLuF0bkp5KV1fu8fIYQYaPoyaQsG4YV/prB2hZX1K604G6MJ0OkXehg7yc3YSSFuvaeJSdND5A6JYbDpJH/I4IFPNrCh2s3P9x7OYRPz+rzacE5aElMLMjAppNKxEAlEEjcRF1prqpr9VDn9LUla3wfLLRvMvPZvBwsXJPHUO/WkpGr++I8mmnQzz39fwv/e8wIwOieF8/cbyQFjB/UogEkVLiGEiB2XU7FhtZX1Ky2sX2Ulf5jBJde5sVrhzRccZA4ymHNIgAlTQkyaHmLk2OiomC0JDjoyvvvEbeMNhrnvow0U13u4+MBR7DtqUJ+3ITvVxrSCDJnGL0QCksRN9KmwEaGiyU9Jgxd/qPf2VessIwxff5rEG887WPadDVuS5vDjfQT9imRHhG+qynjjhwoyHTZOnz2M2SOyye6FimkFWcmStAkhRC9xNiqqK8yMnxxNvn7zi0y+//LH4ij5w8PkD4vGGKXguY9rscVveVinuPwh7v1oA+VNPi49eAwzC7N6/R7pyVYiWre7/i7TYWW6rL0WImFJ4ib6jC9osLS0EW+g7xM2raPBu2Szhd9flUlevsFFV7k45jQf6Zma8kYfj32wlQ01bvYZmc3Zcwp7bV+cTIeVCYPTeuVaQgixJ1q9zMrir21sXGNh4xorNZVm0jIivPJVLUrBPgcFmDY7yPgpYcZOCpGe2XouZqInbZtr3fz98824/CEuP3QsU2NQ6n9kTgpjclNQShEMR2jyBmn2hzEimnDLeoQJg9NkHZsQCUwSN9EnnL4QP5Q2EexhAZGu0BrWrrDwxn8cWKxw7e+bGTU+zN+eaGDqrBDVbh8Ltjay6PMGKpr82K0mLvrJKOb0oLz/zuxWM1OHyZQTIYTYnXAISrdY2LTOwqa1Fjavs/D7h5pIssOXHyXx8lMOho00mDwjyElnhRkzKby9U+6ks7q/P108aa35aE0NLy8pI8th5YYjJzIqp/uzM5KsJsYPTiMQilDj8tPkDWExKyYPzSA37ccRSZvFRF66nTzZjUaIfkUSNxFzNS4/q8qb+2xvHp9H8ek7dt58MZmNa6w4UiIcfaqPSERT1uij2NbIK283Uun0o4iuPTtzn0JmjcgiI9naa+1IT7YyMT+NJEv3twwQQoiBqLHOxOb1FsbtFR0d+/ANO/fcmr59rzSrTTNqXJimBhODh0b42cUezvmlh+SU+O/x1lvq3QH+830py0qbKBqeyQX796z41bDsZMbmpmJpqT5ZOMixfUlCT7auEU
Ikjpglbkqp4cAzwBAgAjymtb4/VvcTiSMS0TR4gzR4gtS5A302NXJbz+vz/0zhhX+mMHp8iF/f2syceW6WVtfyuzfrosmagvF5aRw6IY+ZhZlk9uK+OGl2C8OyHeSk2iRhE0Ls8bb9Xq4qN/HqMw62bLBQvMFKU0M0ubj9wSb2PyzAqHFhTj7Xy5gJYUZPDDF8pIF5hyeUnac+9mf+kME7Kyv5cHU1AKfPHsa8SYO7PdPDYTOz19D0NmOZJGxCDCyxHHELA9dqrZcopdKAxUqpD7XWq2N4TxFnzf4QK8uceIN9k6z5vPDZ+3beeSmZc37pYe8Dg5zwMy8zD/Bi5NazpKSRW99tJBzRjM5J4Zw5I5hZmEmavfdG1raxWkwUFWZKwiaE2OMYYaJJ2UYLxRssbN1ooXiTmdMv9HL8GT6CAcW7rzgYMSbMnEOiidqo8SHGT4kWyRg7KczYSe44f4rYChkRvthQx1vLK2j2h9l3VDYnzyhgUGrS7t/cjoKsZMbl/TjKJoQY2GKWuGmtK4HKlr+7lFJrgAJAErcBqqzRy/pqV0z3XNtm/SoL77yczKdv2/F6TAwfHcbtM/hiQy1LS5tYXdFMeI0mxWbmoPG5HDQuh2FZjpi2aZJMixRCDHA+j6Jki5mSTRZKNlsYOS7M3OP8+P2KX54aLV1vsWiGjTSYMCXM4KHRTrzhowxe/64G0x6YX2xL2N5dWUmjN8S4vFQuP3QYo3NTu33NJKuJiUPSW61bE0IMfH2yxk0pNRKYAXzbF/cTfcsfMthY46bK6Y/pfcIhsFjBMOC2X2fS3GRi38O8DJtTSU1KOc9Xu9FVMCjFxiETcpkxPIuxeal9UiFraGaybKothBgQtIaGOhOlm81EtGLmnCAAl5yUTfGGH2crWCyaY0/3Mfc4PympmtsfbKKgMEzBCAPLTpMalIr+2ZNorfm+uJGXFpduT9guPGAUE4ekdWtapFKQk5pEfqad3NQk2RhbiD1QzBM3pVQq8Apwlda6uY3j84H5AIWFhbFujuhFLn+IrfVeqpv96BgtPzAMWLLQxgf/S2bND1aeeqeO5mCQeVdsZKtRy1aPk60eKLAmc+zUfGYWZjE8K7lPA1qyzcz4wd3vORVCiHjw+6Ch1szQwuio2JP3p7D46yRKi8143dGhsQlTQsx8sQGAA+cFOPQYP4WjDUaMie6TtmOCtv9hibGJdSIobfDy/HclbKhxU5jt6FHCBpCVYmXy0AxZsybEHi6miZtSyko0aXtOa/1qW+dorR8DHgOYPXv2wFl9PID5QwbrqlzUumIXpGsqTbz132Q+eiOZ2iozqekRZhzm5MEPtrC6oR6tYeQgByePL2DmiCyGpPf9aJdS4LBZmJSfJusLhBAJ76N3Lbz/fhrlW82UFVuorTKTnWvw4oI6AFxOEylpEQ4/PsTwUWEKRxsMH/XjRs3n/soTr6b3CyEjwvIyJ99sqWdZaRMpNgvnzhnBT8bm9GhLmOHZDsYPTpURNiFETKtKKuBfwBqt9T2xuo/oO5GIpqTBy5Y6T0xK+7ucinBIkZUTYf0GeOHxFEZMb2bMcVUECkop8QdI81g4evIQDhyXG7e5/cOzHeSlJZFmt0jCJoToN7772sxn71opGBlm+j5BCgqjidm2yo+//j9XvJvYL1U3+/lwdTXfFTfgDRr/396dB8l9l/kdf39/Z9/H3Ic00uiwdcvIsmxsMMYOxmYJhoWEBYJTZGvZzeLN1laytQSyS1Kbym6Oqmy2NskWBYRasuBUgQGbmIUlGB/4wAeybNmWbGRJ1n3OPX1/80ePjJBa1kiaPqb786qamu7pnumnH41+zzy/3/cgFfF477oB7twwcEXL+zsOXD2QYjgTXcBoRWQxq+cVt5uATwIvGGO2z33tc9baB+v4mrKAKhXLZL7E9NzHiakC0/nSxb/xEhQL8NQjAQ/c5/P8T2OM3nyE1C07OTFVYOh3QmwyTzERMJKN8evLhrh2WRa/Sc2S6xjWDqYYSGsum4gsPn/4J3nu/oOxZofRNt44NcODLx7mmX2ncY3h2mVZ3r6im7WDqSueWx0PPdYPp0jVYQVkEVm86rmq5GOArusvQpWK5eDYLK+fmKZQqs8Skccn8/zFn0X4+Y+yFKd9nFie+OZ9eGuOsKI3zm1r+xjtjrMkGyMaNH9Mf+g7bFqSWdANukVEGslvo8NXxdo39wo9PVPk9HSBsdkiU7kSk/ki0/kyscClLxnSl4yQjflYoGwtlYrFcx0inkPEd4kGLtlYQDLi4cwNRyxVKkzlSkzkqicuZwplpvMljk7mODyW49D4LCemCkR8h/euG+A96/oXrD6MdMdY1Zu4ouGVItKeGrKqpCwOlYrl6GSOPcenma3DPmwvvGC573sl7PrdHBib5eT+DaRWVrjm3eO8690V1gwnyMZWLPjrXolU1Kc7ETCciWpSuIhIg52eKXB4LMfp2QJjM0VOTuU5cHqWg2Oz5M85sRj1XZIRj0TokYn6TBdKPLd/jKl5jhRxjSEV9SiW7QW/x3MM/akIK3oS3Lomxk0re65oOOTZQt9h/VCarvj5G2mLiIAat45n585aHp3Ic2wyR6m8sHPXDh9w+N63PX74QMjYwRgYy43/+iD/eGs313wI+pIGyCzoa14px4FVvUkG0hECT3PYREQaqVKx7Dg4zk92H2PnwQnOrkqJ0GMoE+GmVT0syUTpTYZk4wHZqE94gZNrM4US47NFHGPmPqBUseSKZXLFCjOFEqdniozNVK/a+a5DKuKRjvokIz7x0CUeeMQCl0wsqMsWM92JgPVDadUcEXlLatw60NhM4c0iNT5bXPBmzVo4PpXj2/dX+OZ/GAUgMnyK6z5+jH/6CZerly9f0NdbSNHAZeOStOYViIg0mLWWp14/xX0/P8ip6QLpqM/7Nw2yZiBFJuaTifmE3qWPfIgFHrGgNf/cMQZW9CYY7Yk3OxQRWQRa80gmC65csRwen2X/qRlm8gs/DPLEUYeH/i7gwfs93NHDsGkXlYLLyB0l3vXePHe9M0M62torY/UmQ9YNpZq2+ImISKeamC3ytaf28fP9Y4z2xPno1qVsXprGc9r7eLx2MMWQVo0UkXlS49aGpvMlJnMlCqUKhXKFQqnC8ak8xTosNPLdr0f50fdCXnm+ujS/3zvB6KYiH7puKZuWpOlLtvYKjIHnMJCO0J+KaOEREZEmeHbfab725D5yxTIf2bKE29f1d8TCHEu6omraROSSqHFrA5WK5dhkfm51rQL5Yn1WggQ4tN/l5R0+t70/x1SuxHe+azl2LE/mHXvZcss0H741xeq+bFM2CnUcCFwXzzUYwHGqnz3XwXMMnmvwXYfAdQg9h9BzSUU9bWoqItIE0/kSX//Zfp56/RTLu2P8s5tGO6aRycR8rupLNjsMEVlk1LgtYoVShYNjsxw4PVO3Zs1a2LPL4/Efhzz29yF7dvs4juV4Zi+P7j+EczvctTrDP9w8yGA6U5cY3koi4nF1f1KbYYuILCIvHhznq4/vZTJX4q7NQ9y5caDth0WeEXgOG4bTH
XFVUUQWVls1bqVydUig5/zyCovnOLiOwXNMSx0kz6xyNTFbYipfJF+qUCxbSuUK1lYnLDvG4DqGwPvlFaKKteRLZfLFCrlSmUod+rVyGSpl8AP4/rei/NcvpDDGsu5tRW771CHeSL3Kj/ZMce1Ilg9sHmI42/gzpK5jGO2Js6w7pitmIiKLgLWW3Uen+MHOI+w4OM5QOsI9t65ieXdnLMzhuYaueMCy7ri2lxGRy9JWjVuxbNl5cOKCjxtT/YPfdQyuqTZy1eYIjJkbXje3XPDFegE7txCjxZ51u7opaKViqdgat231uWc+X+znl62lXLEUShWm5p2FyzM7bXjm8YAnHgp56uGQ3/7DSW7/YI5t78zz258/TWTlEZ44fJDXJvJc1Z/gH127tmmrYGViPuuH0i2xMbeIyGKw++gkf/vkPiZma+9PZqkuYlWuWMrW4jmGiO8S8RyigUsi9EhFfJIRj2wsuKQToaVKhef2jfHDl46w9+QMyYjHXdcMccf6gbZfDMpxYGk2Rn86QjLU0HwRuTJt1bhdjLVQKtsFX/5+MSvk4d/+iwzbnwooFg2JVIXrb87TPVTgoV3HeHrvKV4tTWF3wXAmyu/duopNw+mmFB9jYLQnzmhPXMVPROQS7D46ydee3P+WzzGmugm16xhKc01cLVHfZXV/gqv7k6wZSLI0G6vZyE3mijy8+zg/2XWcsdki/cmQT96wjLev6O6I/coG0hFW9SV0dU1EFkxHNW6drlyGV3b4PPVIQLlk+K1/OUUQguvBBz42w/W35ImPnOaxPcf58qunKZQrDKUjvH/TIFuXdTGUiTStYYoFLuuGUmRiQVNeX0RkMfu1jYOs+P04B07NXvA5zjnH92K5Qq5YZrZYZjJXXa14YrbI3pPT7Do6yY4D4wBEfIeVvQlW9yUAODKR4/B4joOnZylVLOsGU3zy7cvYONQZ87pc17BlaZZ0TCsVi8jCUuPWAZ5+LOD/PRDh6cdCJsYcHNey7R0FoDo05hN/vJ/tb4zxf94Y48SuAqHn8PaV3bxzdU9T5x4kIx69yZDeZEhSG2KLiFw2MzcN4Nzm7K34roPvOiQjPmcvgHgzvQCcnimw+8gku49N8eqxSb6z/RAAXbGAwXSE29b2cdPKno5ZKRKqVy03DqfVtIlIXahxazPlMuze6fHMYyEf/c1pghB2PB3w7OMh19+cZ9vNeTZsm2Xv1BhfemyMHQfGmSmU8RzDusEUv7ZxkOuWdzVlaEfoO3TFA7rjIdm4T+hpeImISKvKxgKuX9HN9Su6gery/u7c3LhOtbovSU8ibHYYItKm1Li1gYkxwxMPhTzz05BnHw+YHHcwxnLtTQXWbS7ysU9P8Q/uPsYrRyZ47tAE935/klLFEg9cNi/JcM3SDOuHUk0ptrHQpS8ZoTcZagNsEZFFLB529p8Uw9koI92xZochIm2ss4+yi9TsDLz4bEDvYJnlq8oc2OvxX/5Nmq6eMjfckufam/IMr5vgUH6cRx6eZPfRSSZy1ZXEhtIR3n11H9cszbCqL4Fb5/kGvueQjvp4c6t5+q4hGngkAo9Y6Lb9imIiItL+UlGfNQPaUFtE6kuN2yJgLex6weO5JwOeezzkpe0+xaLhg/9kht/97AR9K6b5V/9zkkL6FHtOTPPd41PMPFoGIBvzWTuYYu1ginWDKbri9V3cw5jqimNdiYC+ZIRszNcKkCIi0ta02rGINIIatxZkLez7hcupEy5bbqguIvLHn8kwdsplaEWOTXccJzF6khN9x7jnGzPkS7/chXsoE2HrsiwrexNc1Z+kJxHUvZh0JwKWdsWIBS4Rz+2IVcNERESgOuS/N6l5bSJSf2rcWsTB/Q6PPuzwzBM+r26PMjPuE+vOse2zz3NqukDkzgRLstO48QIngFLUZzARYfVwD4PpCAOpCCNdsYbOMYiHHlf1J+jWRGwREelQy5q4+rKIdBY1bg1grWW6UGYqV2IyV2QiV+L1Xxh2PR/Sf+0RTkzneeZ/r2B8+1LcRI7IyDH633GKgbUTRHyHjcNpsqt9uuIpBlIRhjLRpk4CjwUuy3riDKWbt6+biIhIswWew2Aq0uwwRKRDqHG7AtZaZotlTs8UGZspvPl5bKZY/ZgtMDFbYjxXJD8eMPtaP7k3usi/0U15qnqgXxM5xJJRuO2jp+j/rVmuWe8xkA6J+r0Y09fkd/irEhGP5d1x+lOhGjYRkStgjLkD+G+AC3zJWvvnTQ5JLsPSrpimB4hIw6hxq6FiLVO5EmOzRcZnq81Y9XOR8VyRibmvT8yWKJQr531/zPPxTnVRPDjIii2TXL+uxPGdXXzrh6Oku0tsuD7HNdtOs+3GEiOjK2n1HigV9VneE6MvqbOKIiJXyhjjAv8deA9wAHjaGHO/tfal5kYml8J1DEuynbO5uIg0X0c1bpWKZTJfYnzuatjYbHHu9q82aBOzJcrWnvf98cAlHfNJR31W9iZIRX2yMZ9MNCBSCXn0293seTHCrhcC8rlqN3bbhgk+uGWW3Fr4wLtOMLi03PKNGlQLUibms7Qrps1ERUQW1jbgNWvtHgBjzL3AXYAat0VkMBPRljYi0lBt1bi9fHiC5w+M1WzGxmaKTOSKVM7vx0iEHumoTybmM5SJkon6ZGIB6aj/5tfTUR/fdbAWDu5zeWm7z0vP+/ijZW6/e4ZCAT5/b5KlK8q87yMzrN9SZMOWIt291StykSgMjZQbnJH5M6aah+5EQFc8JBP1NfxDRKQ+hoE3zrp/ALi+SbHIJTAGMjGf3kSEwYxGoYhIY7VV4/bP//ZZjk7k37yfCD0yMZ9M1Gc4E32zGTvTiGXmGjPvLc6YlYrg+dXb//nzKZ56OGT8dPX58WSFO359FoAggG/99DjBIro45TqGvlRIbyIkGw905lBEpDFqnRU777SiMebTwKcBRkZG6h3TopaJ+fQkQqbyJSZmi8wUFvZEaeg7c3O8IwSeaqWINEdbNW7/6cOb2HVkkkwsIBXx3rIhq6Vcgn2/8Hhlh8/LL/i8ssOjUjZ8+YGTAMTilhtuybPumiLrrikwsqKMc9ZLLIamzXUMyYjHYCZKfzK85ByJiMgVOwAsPev+EuDQuU+y1n4R+CLA1q1ba4wXaT+uY1jaFcWbK67GwPhskZNTBcrnDJkxBnqTIcu64qRj/q88VixX2Hdymv2nZqicPxV93s40bMOZqEahiEjT1a1xM8Z8BXg/cMxau6Fer3O2baPdFMvzq23WwpEDLrtf8rj59jzGwF/+aZIHvxkDIJmusGZTkbWbilhbLRCf+dxkPcOvi2w8YEk2SiL0CD1HjZqISPM9Daw2xowCB4HfAD7e3JCaL/QdNi/NkIr45z1WrlhOTOWZmC0SDVySoU88dC9Y03zXYVVfkqFMlF1HJjk5VZh3HI4DPYmQgXSEnniohk1EWkY9r7h9Ffgr4G/q+BqXZO9rLg/93wi7d/rs3ukzMVY94P+vB0+wZFmZ99yVY/N1Ra7aWGR4pPUXEXEczjuTaEx1Y+yueMBwk/d7ExGR81lrS8aYe4AfUN0O4CvW
2p1NDqupMjGfjUvShJ5b83HXMfSnIvRf4p5pscDjbSNZjk3keOXIJIXShS+/uY5heU+cJdmopg6ISEuq21/11tpHjDHL6/XzL/y6cOyww+6dPq+95PHqSz53f2aKNZtKHNjrce+X44yuLnHjbXmu3lDk6g1FBoaqY+E3bCnClmKjQ563WOCSjQdkYwGZmE/Ed8mXyuQKFXKlMhHPJRHxcHV2UESkpVlrHwQebHYcraA3GbJxOF3XK1t9qQjZeMCuI5McGc+d93h3ImDNQIpoULtxFBFpBW11Oebll+HD7+plcrx6psxxLctXlpiarN7fdnOe+392jHARLQQVC1z6UiF9qUjN4SOh5xJ6LmnOf0xERKSVJSMeG+rctJ3huw4bhtMMpCNM5kpAdZWYWOhqn1IRWRSa3rgt5KpZIyPwzvfkWLWuxOq1RUavKv1KkxYEV/TjGyLiu2TjPtlY9cqazv6JiEg7OjOnrdGjRHoSofYnFZFFqemN20KumhWPwx/8u9ZeQMSYanPmnDWBLvAM3fGQnmRIQnPSRESkzbmOYfPSDBFfJydFROZLXUKd+J5DInSJBR6J0CMWVG9HfAfT6queiIiI1InjwIbhdM3h/yIicmH13A7gG8AtQI8x5gDwBWvtl+v1es0UC1xSUZ9kpNqkxUNPZxFFRETOYQxsGErTm9RQRRGRS1XPVSU/Vq+f3SyOA4nQJxF6bzZpiYinZYNFREQuwhhYP5Sm7xKX9BcRkSoNlbwAx4FkxCcVqV5JO9OoaZijiIjIpTEG1g2lGEiraRMRuVxq3OacGe6YjvqkYz6JwGvI8sQiIiLt7uqBJIPpaLPDEBFZ1DqycTtzNS0z16Sloz6hpzlpIiIiC21lX4Il2VizwxARWfQ6onFzHEhHfTJze6Olo37D940RERFpJs9xLrhwlsVSrlgq1lKp1P5+YyDwHAqlCnaem/cs644x2hO/zIhFRORsbdm4OQ6kIj7ZeEDXXKOmYY8iItLJVvUlWNWXuOjzKhVL2f6ykbMWfNch8KoLcc0USuw7OcPh8dkLNnm+5zCQirC6P7mQb0FEpKO1VePmu4a3jWTIxAJdURMREbkMjmNwMFxoV5tY4LF2MMXK3gTHJnNYW/0e1xhCzyEeem82eSIisnDaqnHzXIfuhPaGERERqbfAczR3TUSkgXRKTEREREREpMWpcRMREREREWlxatxERERERERanBo3ERERERGRFqfGTUREREREpMWpcRMREREREWlxatxERERERERanBo3ERERERGRFqfGTUREREREpMWpcRMREREREWlxatxERERERERanLHWNjuGNxljjgP7rvDH9AAnFiCcdqO81Ka8nE85qU15qe1y87LMWtu70MG0K9XHulJealNealNealNealvQGtlSjdtCMMY8Y63d2uw4Wo3yUpvycj7lpDblpTblZfHQv1VtykttykttykttykttC50XDZUUERERERFpcWrcREREREREWlw7Nm5fbHYALUp5qU15OZ9yUpvyUpvysnjo36o25aU25aU25aU25aW2Bc1L281xExERERERaTfteMVNRERERESkrSzKxs0Yc4cxZpcx5jVjzGdrPG6MMX859/gOY8yWZsTZaPPIyyfm8rHDGPO4MWZzM+JstIvl5aznXWeMKRtjPtLI+JplPnkxxtxijNlujNlpjHm40TE2wzz+H6WNMQ8YY56fy8unmhFnIxljvmKMOWaMefECj3fkMbdVqUbWphpZm2pkbaqRtalGnq+hNdJau6g+ABf4BbACCIDngXXnPOd9wPcBA9wAPNXsuFskLzcC2bnbdyov5z3vx8CDwEeaHXcr5AXIAC8BI3P3+5odd4vk5XPAf5y73QucAoJmx17nvNwMbAFevMDjHXfMbdUP1cgryotqpGrkpfy+qEaqRp55zw2rkYvxits24DVr7R5rbQG4F7jrnOfcBfyNrXoSyBhjBhsdaINdNC/W2settafn7j4JLGlwjM0wn98XgN8DvgUca2RwTTSfvHwcuM9aux/AWtsJuZlPXiyQNMYYIEG1KJUaG2ZjWWsfofo+L6QTj7mtSjWyNtXI2lQja1ONrE01soZG1sjF2LgNA2+cdf/A3Ncu9Tnt5lLf829S7f7b3UXzYowZBj4E/HUD42q2+fy+XAVkjTE/McY8a4y5u2HRNc988vJXwFrgEPAC8PvW2kpjwmtZnXjMbVWqkbWpRtamGlmbamRtqpGXZ8GOud6ChNNYpsbXzl0acz7PaTfzfs/GmHdTLUrvqGtErWE+efkL4I+steXqCaKOMJ+8eMC1wG1AFHjCGPOktXZ3vYNrovnk5b3AduBWYCXw98aYR621E3WOrZV14jG3ValG1qYaWZtqZG2qkbWpRl6eBTvmLsbG7QCw9Kz7S6h29Zf6nHYzr/dsjNkEfAm401p7skGxNdN88rIVuHeuIPUA7zPGlKy132lIhM0x3/9HJ6y108C0MeYRYDPQzkVpPnn5FPDntjpw/TVjzOvAGuBnjQmxJXXiMbdVqUbWphpZm2pkbaqRtalGXp4FO+YuxqGSTwOrjTGjxpgA+A3g/nOecz9w99wqLjcA49baw40OtMEumhdjzAhwH/DJNj8jdLaL5sVaO2qtXW6tXQ58E/jdNi9IML//R98F3mmM8YwxMeB64OUGx9lo88nLfqpnWDHG9ANXA3saGmXr6cRjbqtSjaxNNbI21cjaVCNrU428PAt2zF10V9ystSVjzD3AD6iubvMVa+1OY8zvzD3+11RXPXof8BowQ7X7b2vzzMufAN3A/5g7c1ay1m5tVsyNMM+8dJz55MVa+7Ix5u+AHUAF+JK1tuZSt+1inr8vfwp81RjzAtXhD39krT3RtKAbwBjzDeAWoMcYcwD4AuBD5x5zW5VqZG2qkbWpRtamGlmbamRtjayRpnolU0RERERERFrVYhwqKSIiIiIi0lHUuImIiIiIiLQ4NW4iIiIiIiItTo2biIiIiIhIi1PjJiIiIiIi0uLUuImIiIiIiLQ4NW4iIiIiIiItTo2bSBMYY64zxuwwxkSMMXFjzE5jzIZmxyUiItJsqpEitWkDbpEmMcb8eyACRIED1to/a3JIIiIiLUE1UuR8atxEmsQYEwBPAzngRmttuckhiYiItATVSJHzaaikSPN0AQkgSfWsooiIiFSpRoqcQ1fcRJrEGHM/cC8wCgxaa+9pckgiIiItQTVS5HxeswMQ6UTGmLuBkrX268YYF3jcGHOrtfbHzY5NRESkmVQjRWrTFTcREREREZEWpzluIiIiIiIiLU6Nm4iIiIiISItT4yYiIiIiItLi1LiJiIiIiIi0ODVuIiIiIiIiLU6Nm4iIiIiISItT4yYiIiIiItLi1LiJiIiIiIi0uP8PjNIXRw6QHiEAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "plt.figure(figsize=(15, 5))\n", "plt.subplot(1, 2, 1)\n", @@ -609,7 +413,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 16, @@ -840,7 +644,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 24, @@ -1472,7 +1276,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 36, diff --git a/notebooks/Causal Model Selection with the RScorer.ipynb b/notebooks/Causal Model Selection with the RScorer.ipynb index 8c7317ac..b8bded33 100644 --- a/notebooks/Causal Model Selection with the RScorer.ipynb +++ b/notebooks/Causal Model Selection with the RScorer.ipynb @@ -61,7 +61,7 @@ "# Main imports\n", "from econml.dml import DML, LinearDML, SparseLinearDML, NonParamDML\n", "from econml.metalearners import XLearner, TLearner, SLearner, DomainAdaptationLearner\n", - "from econml.drlearner import DRLearner\n", + "from econml.dr import DRLearner\n", "\n", "import numpy as np\n", "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", diff --git a/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company - EconML + DoWhy.ipynb b/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company - EconML + DoWhy.ipynb index 6a69073d..7a703a2a 100644 --- a/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company - EconML + DoWhy.ipynb +++ b/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company - EconML + DoWhy.ipynb @@ -77,7 +77,7 @@ "from xgboost import XGBRegressor, XGBClassifier\n", "\n", "# EconML imports\n", - "from econml.drlearner import LinearDRLearner\n", + "from econml.dr import LinearDRLearner\n", "\n", "# DoWhy imports \n", "import dowhy\n", @@ -611,7 +611,7 @@ "source": [ "test_customers = X.iloc[:1000].values\n", "drlearner_estimate = model.estimate_effect(identified_estimand,\n", - " method_name=\"backdoor.econml.drlearner.LinearDRLearner\",\n", + " method_name=\"backdoor.econml.dr.LinearDRLearner\",\n", " target_units = test_customers,\n", " treatment_value = 1,\n", " method_params={\n", diff --git a/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company.ipynb b/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company.ipynb index 38467f5a..91bfaf9a 100644 --- a/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company.ipynb +++ b/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company.ipynb @@ -63,7 +63,7 @@ "from xgboost import XGBRegressor, XGBClassifier\n", "\n", "# EconML imports\n", - "from econml.drlearner import LinearDRLearner\n", + "from econml.dr import LinearDRLearner\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", @@ -436,7 +436,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 8, diff --git a/notebooks/CustomerScenarios/Case Study - Recommendation AB Testing at An Online Travel Company - EconML + DoWhy.ipynb b/notebooks/CustomerScenarios/Case Study - Recommendation AB Testing at An Online Travel Company - EconML + DoWhy.ipynb index 229e1c65..7b91ad6a 100644 --- a/notebooks/CustomerScenarios/Case Study - Recommendation AB Testing at An Online Travel Company - EconML + DoWhy.ipynb +++ b/notebooks/CustomerScenarios/Case Study - Recommendation AB Testing at An Online Travel 
diff --git a/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company - EconML + DoWhy.ipynb b/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company - EconML + DoWhy.ipynb
index 6a69073d..7a703a2a 100644
--- a/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company - EconML + DoWhy.ipynb
+++ b/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company - EconML + DoWhy.ipynb
@@ -77,7 +77,7 @@
 "from xgboost import XGBRegressor, XGBClassifier\n",
 "\n",
 "# EconML imports\n",
- "from econml.drlearner import LinearDRLearner\n",
+ "from econml.dr import LinearDRLearner\n",
 "\n",
 "# DoWhy imports \n",
 "import dowhy\n",
@@ -611,7 +611,7 @@
 "source": [
 "test_customers = X.iloc[:1000].values\n",
 "drlearner_estimate = model.estimate_effect(identified_estimand,\n",
- " method_name=\"backdoor.econml.drlearner.LinearDRLearner\",\n",
+ " method_name=\"backdoor.econml.dr.LinearDRLearner\",\n",
 " target_units = test_customers,\n",
 " treatment_value = 1,\n",
 " method_params={\n",
diff --git a/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company.ipynb b/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company.ipynb
index 38467f5a..91bfaf9a 100644
--- a/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company.ipynb
+++ b/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company.ipynb
@@ -63,7 +63,7 @@
 "from xgboost import XGBRegressor, XGBClassifier\n",
 "\n",
 "# EconML imports\n",
- "from econml.drlearner import LinearDRLearner\n",
+ "from econml.dr import LinearDRLearner\n",
 "\n",
 "import matplotlib.pyplot as plt\n",
 "import seaborn as sns\n",
@@ -436,7 +436,7 @@
 {
 "data": {
 "text/plain": [
- ""
+ ""
 ]
 },
 "execution_count": 8,
diff --git a/notebooks/CustomerScenarios/Case Study - Recommendation AB Testing at An Online Travel Company - EconML + DoWhy.ipynb b/notebooks/CustomerScenarios/Case Study - Recommendation AB Testing at An Online Travel Company - EconML + DoWhy.ipynb
index 229e1c65..7b91ad6a 100644
--- a/notebooks/CustomerScenarios/Case Study - Recommendation AB Testing at An Online Travel Company - EconML + DoWhy.ipynb
+++ b/notebooks/CustomerScenarios/Case Study - Recommendation AB Testing at An Online Travel Company - EconML + DoWhy.ipynb
@@ -94,7 +94,7 @@
 "from dowhy import CausalModel\n",
 "\n",
 "# EconML imports\n",
- "from econml.ortho_iv import LinearIntentToTreatDRIV\n",
+ "from econml.iv.dr import LinearIntentToTreatDRIV\n",
 "from econml.cate_interpreter import SingleTreeCateInterpreter, \\\n",
 " SingleTreePolicyInterpreter\n",
 "\n",
@@ -861,7 +861,7 @@
 "source": [
 "test_customers = X_data.iloc[:1000]\n",
 "driv_estimate = model.estimate_effect(identified_estimand,\n",
- " method_name=\"iv.econml.ortho_iv.LinearIntentToTreatDRIV\",\n",
+ " method_name=\"iv.econml.iv.dr.LinearIntentToTreatDRIV\",\n",
 " target_units = test_customers,\n",
 " method_params={\n",
 " \"init_params\":{'model_T_XZ': model_T_XZ, \n",
diff --git a/notebooks/CustomerScenarios/Case Study - Recommendation AB Testing at An Online Travel Company.ipynb b/notebooks/CustomerScenarios/Case Study - Recommendation AB Testing at An Online Travel Company.ipynb
index f6445a51..8477343d 100644
--- a/notebooks/CustomerScenarios/Case Study - Recommendation AB Testing at An Online Travel Company.ipynb
+++ b/notebooks/CustomerScenarios/Case Study - Recommendation AB Testing at An Online Travel Company.ipynb
@@ -82,7 +82,7 @@
 "from sklearn.preprocessing import PolynomialFeatures\n",
 "\n",
 "# EconML imports\n",
- "from econml.ortho_iv import LinearIntentToTreatDRIV\n",
+ "from econml.iv.dr import LinearIntentToTreatDRIV\n",
 "from econml.cate_interpreter import SingleTreeCateInterpreter, \\\n",
 " SingleTreePolicyInterpreter\n",
 "\n",
@@ -419,7 +419,7 @@
 {
 "data": {
 "text/plain": [
- ""
+ ""
 ]
 },
 "execution_count": 7,
diff --git a/notebooks/Deep IV Examples.ipynb b/notebooks/Deep IV Examples.ipynb
index 10b86d17..f45ca81e 100644
--- a/notebooks/Deep IV Examples.ipynb
+++ b/notebooks/Deep IV Examples.ipynb
@@ -43,7 +43,7 @@
 "\n",
 "### Using the SDK\n",
 "\n",
- "In the `econml` package, our Deep IV estimator is built on top of the Keras framework; we support either the Tensorflow or the Theano backends. There are three steps to using the `DeepIVEstimator`:\n",
+ "In the `econml` package, our Deep IV estimator is built on top of the Keras framework; we support either the Tensorflow or the Theano backends. There are three steps to using the `DeepIV`:\n",
 "\n",
 "1. Construct an instance. \n",
 " * The `m` and `h` arguments to the initializer specify deep neural network models for estimating `T` and `Y` as described above. They are each *functions* that take two Keras inputs and return a Keras model (the inputs are `z` and `x` in the case of `m` and the output's shape should match `t`'s; the inputs are `t` and `x` in the case of `h` and the output's shape should match `y`'s). Note that the `h` function will be called multiple times, but should reuse the same weights - see below for a concrete example of how to achieve this using the Keras API.\n",
@@ -71,7 +71,7 @@
 }
 ],
 "source": [
- "from econml.deepiv import DeepIVEstimator\n",
+ "from econml.iv.nnet import DeepIV\n",
 "import keras\n",
 "import numpy as np\n",
 "import matplotlib.pyplot as plt\n",
@@ -226,7 +226,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "Now we'll instantiate the `DeepIVEstimator` class using these models. Defining the response model *outside* of the lambda passed into constructor is important, because (depending on the settings for the loss) it can be used multiple times in the second stage and we want the same weights to be used every time."
+ "Now we'll instantiate the `DeepIV` class using these models. Defining the response model *outside* of the lambda passed into constructor is important, because (depending on the settings for the loss) it can be used multiple times in the second stage and we want the same weights to be used every time."
 ]
 },
 {
@@ -239,15 +239,15 @@
 " \"validation_split\": 0.1,\n",
 " \"callbacks\": [keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)]}\n",
 "\n",
- "deepIvEst = DeepIVEstimator(n_components = 10, # number of gaussians in our mixture density network\n",
- " m = lambda z, x : treatment_model(keras.layers.concatenate([z,x])), # treatment model\n",
- " h = lambda t, x : response_model(keras.layers.concatenate([t,x])), # response model\n",
- " n_samples = 1, # number of samples to use to estimate the response\n",
- " use_upper_bound_loss = False, # whether to use an approximation to the true loss\n",
- " n_gradient_samples = 1, # number of samples to use in second estimate of the response (to make loss estimate unbiased)\n",
- " optimizer='adam', # Keras optimizer to use for training - see https://keras.io/optimizers/ \n",
- " first_stage_options = keras_fit_options, # options for training treatment model\n",
- " second_stage_options = keras_fit_options) # options for training response model"
+ "deepIvEst = DeepIV(n_components = 10, # number of gaussians in our mixture density network\n",
+ " m = lambda z, x : treatment_model(keras.layers.concatenate([z,x])), # treatment model\n",
+ " h = lambda t, x : response_model(keras.layers.concatenate([t,x])), # response model\n",
+ " n_samples = 1, # number of samples to use to estimate the response\n",
+ " use_upper_bound_loss = False, # whether to use an approximation to the true loss\n",
+ " n_gradient_samples = 1, # number of samples to use in second estimate of the response (to make loss estimate unbiased)\n",
+ " optimizer='adam', # Keras optimizer to use for training - see https://keras.io/optimizers/ \n",
+ " first_stage_options = keras_fit_options, # options for training treatment model\n",
+ " second_stage_options = keras_fit_options) # options for training response model"
 ]
 },
 {
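The markdown above describes step 1 (constructing the estimator); the other two steps are fitting on data that includes the instrument and querying treatment effects. A minimal sketch of those steps using the `deepIvEst` object from the hunk above; the arrays `Y`, `T`, `X`, `Z` and `X_test` are placeholder assumptions, not variables taken from the notebook:

```Python
import numpy as np

# Step 2: fit the two-stage Deep IV model; Z holds the instrument(s)
deepIvEst.fit(Y, T, X=X, Z=Z)

# Step 3: effect of moving the (continuous) treatment from T0 to T1 at each test point
effects = deepIvEst.effect(X_test,
                           T0=np.zeros(X_test.shape[0]),
                           T1=np.ones(X_test.shape[0]))
```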
diff --git a/notebooks/Doubly Robust Learner and Interpretability.ipynb b/notebooks/Doubly Robust Learner and Interpretability.ipynb
index 55fea520..e41ff8e0 100644
--- a/notebooks/Doubly Robust Learner and Interpretability.ipynb
+++ b/notebooks/Doubly Robust Learner and Interpretability.ipynb
@@ -86,7 +86,7 @@
 {
 "data": {
 "text/plain": [
- ""
+ ""
 ]
 },
 "execution_count": 3,
@@ -96,7 +96,7 @@
 ],
 "source": [
 "from sklearn.linear_model import LassoCV\n",
- "from econml.drlearner import LinearDRLearner\n",
+ "from econml.dr import LinearDRLearner\n",
 "from sklearn.linear_model import LogisticRegressionCV\n",
 "from sklearn.dummy import DummyClassifier\n",
 "\n",
@@ -514,7 +514,7 @@
 {
 "data": {
 "text/plain": [
- ""
+ ""
 ]
 },
 "execution_count": 12,
@@ -524,7 +524,7 @@
 ],
 "source": [
 "from econml.sklearn_extensions.linear_model import WeightedLassoCV\n",
- "from econml.drlearner import LinearDRLearner\n",
+ "from econml.dr import LinearDRLearner\n",
 "from sklearn.linear_model import LogisticRegressionCV\n",
 "from sklearn.dummy import DummyClassifier\n",
 "from sklearn.preprocessing import PolynomialFeatures\n",
@@ -791,7 +791,7 @@
 {
 "data": {
 "text/plain": [
- ""
+ ""
 ]
 },
 "execution_count": 16,
@@ -801,7 +801,7 @@
 ],
 "source": [
 "from econml.sklearn_extensions.linear_model import WeightedLassoCV\n",
- "from econml.drlearner import SparseLinearDRLearner\n",
+ "from econml.dr import SparseLinearDRLearner\n",
 "from sklearn.linear_model import LogisticRegressionCV\n",
 "from sklearn.dummy import DummyClassifier\n",
 "from sklearn.preprocessing import PolynomialFeatures\n",
@@ -1143,7 +1143,7 @@
 {
 "data": {
 "text/plain": [
- ""
+ ""
 ]
 },
 "execution_count": 21,
@@ -1152,7 +1152,7 @@
 }
 ],
 "source": [
- "from econml.drlearner import ForestDRLearner\n",
+ "from econml.dr import ForestDRLearner\n",
 "from sklearn.ensemble import GradientBoostingRegressor\n",
 "\n",
 "est = ForestDRLearner(model_regression=GradientBoostingRegressor(),\n",
@@ -1690,7 +1690,7 @@
 {
 "data": {
 "text/plain": [
- ""
+ ""
 ]
 },
 "execution_count": 38,
@@ -1700,7 +1700,7 @@
 ],
 "source": [
 "# We need to use a scikit-learn final model\n",
- "from econml.drlearner import DRLearner\n",
+ "from econml.dr import DRLearner\n",
 "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier\n",
 "\n",
 "# One can replace model_y and model_t with any scikit-learn regressor and classifier correspondingly\n",
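The hunks above note that the generic `DRLearner` takes any scikit-learn regressor/classifier for its nuisance models and a scikit-learn regressor as the final CATE model; a fitted learner can then be summarized with the `SingleTreeCateInterpreter` from `econml.cate_interpreter` (imported in the case-study notebooks earlier). A minimal sketch of that combination; the arrays `Y`, `T`, `X` and the list `feature_names` are placeholder assumptions, not taken from the notebook:

```Python
from econml.dr import DRLearner
from econml.cate_interpreter import SingleTreeCateInterpreter
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, RandomForestRegressor

# Generic DR learner: any scikit-learn regressor/classifier can be swapped in for the
# nuisance models; the final CATE model must also be a scikit-learn regressor
est = DRLearner(model_regression=GradientBoostingRegressor(),
                model_propensity=GradientBoostingClassifier(),
                model_final=RandomForestRegressor())
est.fit(Y, T, X=X)

# Summarize the learned CATE function with a shallow, interpretable tree
intrp = SingleTreeCateInterpreter(max_depth=2)
intrp.interpret(est, X)
intrp.plot(feature_names=feature_names)
```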
diff --git a/notebooks/ForestLearners Basic Example.ipynb b/notebooks/ForestLearners Basic Example.ipynb
index 6158b0a1..038042bc 100644
--- a/notebooks/ForestLearners Basic Example.ipynb
+++ b/notebooks/ForestLearners Basic Example.ipynb
@@ -490,7 +490,7 @@
 {
 "data": {
 "text/plain": [
- ""
+ ""
 ]
 },
 "execution_count": 19,
@@ -499,7 +499,7 @@
 }
 ],
 "source": [
- "from econml.drlearner import ForestDRLearner\n",
+ "from econml.dr import ForestDRLearner\n",
 "from sklearn.dummy import DummyRegressor, DummyClassifier\n",
 "\n",
 "est = ForestDRLearner(model_regression=model_y,\n",
@@ -520,7 +520,7 @@
 {
 "data": {
 "text/plain": [
- ""
+ ""
 ]
 },
 "execution_count": 20,
@@ -529,7 +529,7 @@
 }
 ],
 "source": [
- "from econml.drlearner import DRLearner\n",
+ "from econml.dr import DRLearner\n",
 "est2 = DRLearner(model_regression=model_y,\n",
 " model_propensity=model_t,\n",
 " model_final=final_stage(),\n",
@@ -780,7 +780,7 @@
 {
 "data": {
 "text/plain": [
- ""
+ ""
 ]
 },
 "execution_count": 30,
@@ -789,7 +789,7 @@
 }
 ],
 "source": [
- "from econml.ortho_forest import DROrthoForest\n",
+ "from econml.orf import DROrthoForest\n",
 "from sklearn.linear_model import Lasso, LassoCV, LogisticRegression, LogisticRegressionCV\n",
 "from econml.sklearn_extensions.linear_model import WeightedLassoCV\n",
 "\n",
diff --git a/notebooks/Generalized Random Forests.ipynb b/notebooks/Generalized Random Forests.ipynb
index 76ff0612..2510f52b 100644
--- a/notebooks/Generalized Random Forests.ipynb
+++ b/notebooks/Generalized Random Forests.ipynb
@@ -23,7 +23,7 @@
 "\n",
 "Causal Forests and [Generalized Random Forests](https://arxiv.org/pdf/1610.01271.pdf) are a flexible method for estimating treatment effect heterogeneity with Random Forests. The `econml.grf` module implements a high-performance Cython version of the [`grf`](https://github.com/grf-labs/grf) R-package, with support for CausalForests, IVForests and RegressionForests. The module provides estimators that adhere to the scikit-learn fit and predict API, as well as providing methods for uncertainty quantification and confidence intervals.\n",
 "\n",
- "Within the EconML SDK we use these estimators as final models for CATE estimation, such as in the case of `econml.dml.CausalForestDML`, where we combine a Causal Forest with Double Machine Learning, to residualize the treatment and outcome and call the `econml.grf.CausalForest` on the residuals. Similarly, the `econml.drlearner.ForestDRLearner` uses an `econml.grf.RegressionForest` as a final stage estimator on the doubly robust targets estimated by the first stage. The estimators here should primarily be used in conjunction with CateEstimators and not as standalone, but we provide here examples of their direct usage functionality.\n",
+ "Within the EconML SDK we use these estimators as final models for CATE estimation, such as in the case of `econml.dml.CausalForestDML`, where we combine a Causal Forest with Double Machine Learning, to residualize the treatment and outcome and call the `econml.grf.CausalForest` on the residuals. Similarly, the `econml.dr.ForestDRLearner` uses an `econml.grf.RegressionForest` as a final stage estimator on the doubly robust targets estimated by the first stage. The estimators here should primarily be used in conjunction with CateEstimators and not as standalone, but we provide here examples of their direct usage functionality.\n",
 "\n",
 "The EconML SDK implements the following Generalized Random Forest variants:\n",
 "\n",
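The paragraph above notes that the `econml.grf` estimators follow the scikit-learn fit/predict API and provide confidence intervals. A minimal sketch of direct, standalone usage of `econml.grf.CausalForest`; the arrays `X`, `T`, `y`, `X_test` are placeholders and the parameter values are illustrative assumptions only:

```Python
from econml.grf import CausalForest

# Standalone causal forest fit directly on the raw data; inside CausalForestDML the
# treatment and outcome are first residualized and the forest is fit on those residuals
forest = CausalForest(n_estimators=1000, min_samples_leaf=5, random_state=123)
forest.fit(X, T, y)

point = forest.predict(X_test)                        # point estimates of the effect
lb, ub = forest.predict_interval(X_test, alpha=0.05)  # pointwise confidence intervals
```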
diff --git a/notebooks/Interpretability with SHAP.ipynb b/notebooks/Interpretability with SHAP.ipynb
index 27685ca8..112105c0 100644
--- a/notebooks/Interpretability with SHAP.ipynb
+++ b/notebooks/Interpretability with SHAP.ipynb
@@ -41,9 +41,9 @@
 "outputs": [],
 "source": [
 "from econml.dml import CausalForestDML, LinearDML, NonParamDML\n",
- "from econml.drlearner import DRLearner\n",
+ "from econml.dr import DRLearner\n",
 "from econml.metalearners import DomainAdaptationLearner, XLearner\n",
- "from econml.ortho_iv import LinearIntentToTreatDRIV\n",
+ "from econml.iv.dr import LinearIntentToTreatDRIV\n",
 "import numpy as np\n",
 "import scipy.special\n",
 "import matplotlib.pyplot as plt\n",
diff --git a/notebooks/Metalearners Examples.ipynb b/notebooks/Metalearners Examples.ipynb
index be8507cd..c058c49f 100644
--- a/notebooks/Metalearners Examples.ipynb
+++ b/notebooks/Metalearners Examples.ipynb
@@ -258,7 +258,7 @@
 ],
 "source": [
 "# Instantiate Doubly Robust Learner\n",
- "from econml.drlearner import DRLearner\n",
+ "from econml.dr import DRLearner\n",
 "outcome_model = GradientBoostingRegressor(n_estimators=100, max_depth=6, min_samples_leaf=int(n/100))\n",
 "pseudo_treatment_model = GradientBoostingRegressor(n_estimators=100, max_depth=6, min_samples_leaf=int(n/100))\n",
 "propensity_model = RandomForestClassifier(n_estimators=100, max_depth=6, \n",