From d3245908b106008e046a5fbad4bca7072b685154 Mon Sep 17 00:00:00 2001
From: Amit Sharma
Date: Tue, 2 Mar 2021 20:06:00 +0530
Subject: [PATCH] Move matplotlib to an optional dependency (#243)

* removed matplotlib

Signed-off-by: Amit Sharma

* added new req file

Signed-off-by: Amit Sharma
---
 .github/workflows/python-package.yml       |   1 +
 .../add_unobserved_common_cause.py         | 110 +++++++++---
 .../confounder_distribution_interpreter.py |   2 +-
 .../propensity_balance_interpreter.py      |   3 +-
 requirements-plotting.txt                  |   1 +
 requirements.txt                           |   1 -
 setup.py                                   |   9 ++
 7 files changed, 71 insertions(+), 56 deletions(-)
 create mode 100644 requirements-plotting.txt

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index c93f6061f..3768eac90 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -28,6 +28,7 @@ jobs:
         python -m pip install --upgrade pip
         pip install flake8 pytest twine
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        if [ -f requirements-plotting.txt ]; then pip install -r requirements-plotting.txt; fi
         pip install causalml nbformat jupyter
         pip install econml
     - name: Lint with flake8
diff --git a/dowhy/causal_refuters/add_unobserved_common_cause.py b/dowhy/causal_refuters/add_unobserved_common_cause.py
index 77ff5fd4e..67bdf9376 100755
--- a/dowhy/causal_refuters/add_unobserved_common_cause.py
+++ b/dowhy/causal_refuters/add_unobserved_common_cause.py
@@ -3,8 +3,6 @@ import logging
 
 import numpy as np
 import pandas as pd
-import matplotlib
-import matplotlib.pyplot as plt
 import math
 
 import statsmodels.api as sm
@@ -61,21 +59,21 @@ class AddUnobservedCommonCause(CausalRefuter):
             new_effect = new_estimator.estimate_effect()
             refute = CausalRefutation(self._estimate.value, new_effect.value,
                                       refutation_type="Refute: Add an Unobserved Common Cause")
-
+
             refute.new_effect = np.array(new_effect.value)
             refute.add_refuter(self)
             return refute
         else: # Deal with multiple value inputs
-
+
            if isinstance(self.kappa_t, np.ndarray) and isinstance(self.kappa_y, np.ndarray): # Deal with range inputs
                 # Get a 2D matrix of values
                 x,y = np.meshgrid(self.kappa_t, self.kappa_y) # x,y are both MxN
-
+
                 results_matrix = np.random.rand(len(x),len(y)) # Matrix to hold all the results of NxM
                 print(results_matrix.shape)
                 orig_data = copy.deepcopy(self._data)
-
+
                 for i in range(0,len(x[0])):
                     for j in range(0,len(y)):
                         new_data = self.include_confounders_effect(orig_data, x[0][i], y[j][0])
@@ -85,10 +83,12 @@ class AddUnobservedCommonCause(CausalRefuter):
                                                   refutation_type="Refute: Add an Unobserved Common Cause")
                         self.logger.debug(refute)
                         results_matrix[i][j] = refute.estimated_effect # Populate the results
-
+
+                import matplotlib
+                import matplotlib.pyplot as plt
                 fig = plt.figure(figsize=(6,5))
                 left, bottom, width, height = 0.1, 0.1, 0.8, 0.8
-                ax = fig.add_axes([left, bottom, width, height])
+                ax = fig.add_axes([left, bottom, width, height])
 
                 cp = plt.contourf(x, y, results_matrix)
                 plt.colorbar(cp)
@@ -115,9 +115,11 @@ class AddUnobservedCommonCause(CausalRefuter):
                     self.logger.debug(refute)
                     outcomes[i] = refute.estimated_effect # Populate the results
 
+                import matplotlib
+                import matplotlib.pyplot as plt
                 fig = plt.figure(figsize=(6,5))
                 left, bottom, width, height = 0.1, 0.1, 0.8, 0.8
-                ax = fig.add_axes([left, bottom, width, height])
+                ax = fig.add_axes([left, bottom, width, height])
 
                 plt.plot(self.kappa_t, outcomes)
                 ax.set_title('Effect of Unobserved Common Cause')
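These hunks show the pattern the patch applies throughout: matplotlib is now imported inside the plotting branch, at the moment a figure is actually drawn, so that `import dowhy` succeeds without matplotlib installed. A minimal standalone sketch of the same deferred-import idiom follows; the helper name and the error message are illustrative additions, not part of this patch:

    def plot_outcomes(kappa_t, outcomes):
        # Defer the matplotlib import so the module loads without it installed.
        try:
            import matplotlib.pyplot as plt
        except ImportError as err:
            raise ImportError(
                "Plotting requires matplotlib; install the optional extra "
                "with `pip install dowhy[plotting]`."
            ) from err
        fig = plt.figure(figsize=(6, 5))
        ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
        ax.plot(kappa_t, outcomes)
        ax.set_title('Effect of Unobserved Common Cause')
        plt.show()

The bare `import matplotlib` in the patch keeps the original behavior and simply raises ImportError at plot time; wrapping it as above is one way to point users at the new extra.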
refutation_type="Refute: Add an Unobserved Common Cause") self.logger.debug(refute) outcomes[i] = refute.estimated_effect # Populate the results - + + import matplotlib + import matplotlib.pyplot as plt fig = plt.figure(figsize=(6,5)) left, bottom, width, height = 0.1, 0.1, 0.8, 0.8 ax = fig.add_axes([left, bottom, width, height]) @@ -150,7 +154,7 @@ class AddUnobservedCommonCause(CausalRefuter): ax.set_title('Effect of Unobserved Common Cause') ax.set_xlabel('Value of Linear Constant on Outcome') ax.set_ylabel('New Effect') - plt.show() + plt.show() refute.new_effect = outcomes refute.add_refuter(self) @@ -200,46 +204,46 @@ class AddUnobservedCommonCause(CausalRefuter): def include_simulated_confounder(self, convergence_threshold = 0.1, c_star_max = 1000): ''' This function simulates an unobserved confounder based on the data using the following steps: - 1. It calculates the "residuals" from the treatment and outcome model + 1. It calculates the "residuals" from the treatment and outcome model i.) The outcome model has outcome as the dependent variable and all the observed variables including treatment as independent variables ii.) The treatment model has treatment as the dependent variable and all the observed variables as independent variables. 2. U is an intermediate random variable drawn from the normal distribution with the weighted average of residuals as mean and a unit variance U ~ N(c1*d_y + c2*d_t, 1) - where + where *d_y and d_t are residuals from the treatment and outcome model - *c1 and c2 are coefficients to the residuals - + *c1 and c2 are coefficients to the residuals + 3. The final U, which is the simulated unobserved confounder is obtained by debiasing the intermediate variable U by residualising it with X Choosing the coefficients c1 and c2: The coefficients are chosen based on these basic assumptions: 1. There is a hyperbolic relationship satisfying c1*c2 = c_star - 2. c_star is chosen from a range of possible values based on the correlation of the obtained simulated variable with outcome and treatment. + 2. c_star is chosen from a range of possible values based on the correlation of the obtained simulated variable with outcome and treatment. 3. The product of correlations with treatment and outcome should be at a minimum distance to the maximum correlations with treatment and outcome in any of the observed confounders - 4. The ratio of the weights should be such that they maintain the ratio of the maximum possible observed coefficients within some confidence interval + 4. The ratio of the weights should be such that they maintain the ratio of the maximum possible observed coefficients within some confidence interval - :param c_star_max: The maximum possible value for the hyperbolic curve on which the coefficients to the residuals lie. It defaults to 1000 in the code if not specified by the user. + :param c_star_max: The maximum possible value for the hyperbolic curve on which the coefficients to the residuals lie. It defaults to 1000 in the code if not specified by the user. :type int :param convergence_threshold: The threshold to check the plateauing of the correlation while selecting a c_star. 
 
-        #Obtaining the list of observed variables
+        #Obtaining the list of observed variables
         required_variables = True
         observed_variables = self.choose_variables(required_variables)
 
         observed_variables_with_treatment_and_outcome = observed_variables + self._treatment_name + self._outcome_name
 
-        #Taking a subset of the dataframe that has only observed variables
+        #Taking a subset of the dataframe that has only observed variables
         self._data = self._data[observed_variables_with_treatment_and_outcome]
 
-        #Residuals from the outcome model obtained by fitting a linear model
+        #Residuals from the outcome model obtained by fitting a linear model
         y = self._data[self._outcome_name[0]]
         observed_variables_with_treatment = observed_variables + self._treatment_name
         X = self._data[observed_variables_with_treatment]
@@ -249,7 +253,7 @@ class AddUnobservedCommonCause(CausalRefuter):
 
         d_y = list(pd.Series(residuals_y))
 
-        #Residuals from the treatment model obtained by fitting a linear model
+        #Residuals from the treatment model obtained by fitting a linear model
         t = self._data[self._treatment_name[0]].astype('int64')
         X = self._data[observed_variables]
         model = sm.OLS(t,X)
@@ -258,7 +262,7 @@ class AddUnobservedCommonCause(CausalRefuter):
 
         d_t = list(pd.Series(residuals_t))
 
-        #Initialising product_cor_metric_observed with a really low value, since we are searching for the maximum
+        #Initialising product_cor_metric_observed with a really low value, since we are searching for the maximum
         product_cor_metric_observed = -10000000000
 
         for i in observed_variables:
@@ -273,7 +277,7 @@ class AddUnobservedCommonCause(CausalRefuter):
                 correlation_t_observed = correlation_t
                 correlation_y_observed = correlation_y
-
+
         #The user has an option to give the effect_strength_on_y and effect_strength_on_t, which can then be used instead of the maximum correlation with treatment and outcome in the observed variables, as it specifies the desired effect.
         if self.kappa_t is not None:
             correlation_t_observed = self.kappa_t
@@ -281,12 +285,12 @@ class AddUnobservedCommonCause(CausalRefuter):
             correlation_y_observed = self.kappa_y
 
-        #Choosing a c_star based on the data.
+        #Choosing a c_star based on the data.
         #The correlations stop increasing upon increasing c_star after a certain value, that is, the correlation plateaus, and we choose c_star to be the value at which it plateaus.
-
+
         correlation_y_list = []
         correlation_t_list = []
-        product_cor_metric_simulated_list = []
+        product_cor_metric_simulated_list = []
 
         x_list = []
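The comment above states the selection rule for c_star: sweep candidate values, track the correlation with the outcome, and stop once the gain flattens out. The same rule in a compact numpy form, as an illustrative rewrite rather than the patched code (the loop in the next hunks implements the identical check imperatively):

    import numpy as np

    def first_plateau(x_list, correlation_y_list, convergence_threshold):
        # c_star is the first candidate whose correlation gain over the
        # previous candidate drops to the convergence threshold or below.
        gains = np.diff(correlation_y_list)
        for idx, gain in enumerate(gains, start=1):
            if gain <= convergence_threshold:
                return x_list[idx]
        return x_list[-1]  # no plateau within the sweep; fall back to the last candidate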
@@ -295,7 +299,7 @@ class AddUnobservedCommonCause(CausalRefuter):
             c1 = math.sqrt(i)
             c2 = c1
             final_U = self.generate_confounder_from_residuals(c1, c2, d_y, d_t, X)
-            current_simulated_confounder = final_U
+            current_simulated_confounder = final_U
             outcome_values = self._data[self._outcome_name[0]]
             correlation_y = current_simulated_confounder.corr(outcome_values)
             correlation_y_list.append(correlation_y)
@@ -306,11 +310,11 @@ class AddUnobservedCommonCause(CausalRefuter):
             product_cor_metric_simulated = correlation_y*correlation_t
             product_cor_metric_simulated_list.append(product_cor_metric_simulated)
-
+
             x_list.append(i)
-
+
         index = 1
         while index < len(correlation_y_list):
             if (correlation_y_list[index] - correlation_y_list[index-1]) <= convergence_threshold:
                 c_star = x_list[index]
                 break
             index = index + 1
@@ ... @@ class AddUnobservedCommonCause(CausalRefuter):
             if min_distance_between_product_cor_metrics >= abs(product_cor_metric_simulated - product_cor_metric_observed):
                 min_distance_between_product_cor_metrics = abs(product_cor_metric_simulated - product_cor_metric_observed)
                 additional_condition = (correlation_y_observed/correlation_t_observed)
-                if ((c1/c2) <= (additional_condition + 0.3*additional_condition)) and ((c1/c2) >= (additional_condition - 0.3*additional_condition)): #choose minimum positive value
+                if ((c1/c2) <= (additional_condition + 0.3*additional_condition)) and ((c1/c2) >= (additional_condition - 0.3*additional_condition)): #choose minimum positive value
                     c1_final = c1
                     c2_final = c2
-
+
             i = i*1.5
 
         '''#closed form solution
@@ -371,7 +375,7 @@ class AddUnobservedCommonCause(CausalRefuter):
         c1 = c_star_max/c2'''
 
         final_U = self.generate_confounder_from_residuals(c1_final, c2_final, d_y, d_t, X)
-
+
         return final_U
 
@@ -381,13 +385,13 @@ class AddUnobservedCommonCause(CausalRefuter):
         the row wise normal distribution corresponding to each residual value and then debiasing the intermediate variable to get the final variable
 
         :param c1: coefficient to the residual from the outcome model
-        :type float
-        :param c2: coefficient to the residual from the treatment model
-        :type float
-        :param d_y: residuals from the outcome model
+        :type float
+        :param c2: coefficient to the residual from the treatment model
+        :type float
+        :param d_y: residuals from the outcome model
+        :type list
+        :param d_t: residuals from the treatment model
         :type list
-        :param d_t: residuals from the treatment model
-        :type list
 
         :returns final_U: The simulated values of the unobserved confounder based on the data
         :type pandas.core.series.Series
@@ -408,12 +412,12 @@ class AddUnobservedCommonCause(CausalRefuter):
         final_U = pd.Series(U)
 
         return final_U
-
-
-
-
-
-
\ No newline at end of file
+
+
+
+
+
+
diff --git a/dowhy/interpreters/confounder_distribution_interpreter.py b/dowhy/interpreters/confounder_distribution_interpreter.py
index 4ec7b662a..a7ddde66b 100644
--- a/dowhy/interpreters/confounder_distribution_interpreter.py
+++ b/dowhy/interpreters/confounder_distribution_interpreter.py
@@ -1,5 +1,4 @@
 import numpy as np
-import matplotlib.pyplot as plt
 
 from dowhy.interpreters.visual_interpreter import VisualInterpreter
 from dowhy.causal_estimators.propensity_score_weighting_estimator import PropensityScoreWeightingEstimator
@@ -83,6 +82,7 @@ class ConfounderDistributionInterpreter(VisualInterpreter):
         title1 = "Distribution of " + self.var_name + " before applying the weights"
         title2 = "Distribution of " + self.var_name + " after applying the weights"
 
+        import matplotlib.pyplot as plt
         fig, (ax1, ax2) = plt.subplots(1, 2, figsize=self.fig_size)
         iterable = zip([barplot_df_before, barplot_df_after], [ax1, ax2], [title1, title2])
         for plot_df, ax, title in iterable:
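For context, these plotting paths are reached through dowhy's high-level refute_estimate API, and nothing changes for callers once the plotting extra is installed. A hedged usage sketch: model, identified_estimand, and estimate are assumed to come from the usual CausalModel workflow, and the argument values are illustrative only.

    import numpy as np

    refute = model.refute_estimate(
        identified_estimand, estimate,
        method_name="add_unobserved_common_cause",
        confounders_effect_on_treatment="binary_flip",
        confounders_effect_on_outcome="linear",
        effect_strength_on_treatment=0.01,
        effect_strength_on_outcome=np.arange(0.01, 0.05, 0.01),  # a range of strengths drives the line plot above
    )
    print(refute)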
diff --git a/dowhy/interpreters/propensity_balance_interpreter.py b/dowhy/interpreters/propensity_balance_interpreter.py
index c47c9f570..fe19a3f0b 100644
--- a/dowhy/interpreters/propensity_balance_interpreter.py
+++ b/dowhy/interpreters/propensity_balance_interpreter.py
@@ -1,6 +1,5 @@
 import numpy as np
 import pandas as pd
-import matplotlib.pyplot as plt
 
 from dowhy.interpreters.visual_interpreter import VisualInterpreter
 from dowhy.causal_estimators.propensity_score_estimator import PropensityScoreEstimator
@@ -61,6 +60,8 @@ class PropensityBalanceInterpreter(VisualInterpreter):
             mean_diff_strata["sample"]="PropensityAdjusted"
             mean_diff_overall["sample"] = "Unadjusted"
             plot_df = pd.concat([mean_diff_overall, mean_diff_strata])
+
+        import matplotlib.pyplot as plt
         plt.style.use("seaborn-white")
         fig, ax = plt.subplots(1,1)
         for label, subdf in plot_df.groupby('common_cause_id'):
diff --git a/requirements-plotting.txt b/requirements-plotting.txt
new file mode 100644
index 000000000..6ccafc3f9
--- /dev/null
+++ b/requirements-plotting.txt
@@ -0,0 +1 @@
+matplotlib
diff --git a/requirements.txt b/requirements.txt
index e39820b86..d539460a8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,6 @@ scipy
 statsmodels
 pandas>=0.24
 networkx>=2.0
-matplotlib
 sympy>=1.4
 scikit-learn
 pydot>=1.4
diff --git a/setup.py b/setup.py
index 145fd46f1..2b3379f98 100755
--- a/setup.py
+++ b/setup.py
@@ -19,6 +19,14 @@ with open(path.join(here, 'README.rst'), encoding='utf-8') as f:
 with open(path.join(here, 'requirements.txt'), encoding='utf-8') as f:
     install_requires = f.read().splitlines()
 
+# Plotting packages are optional to install
+extras = ["plotting"]
+extras_require = dict()
+for e in extras:
+    req_file = "requirements-{0}.txt".format(e)
+    with open(req_file) as f:
+        extras_require[e] = [line.strip() for line in f]
+
 # Loading version number
 with open(path.join(here, 'dowhy', 'VERSION')) as version_file:
     version = version_file.read().strip()
@@ -45,6 +53,7 @@ setup(
     packages=find_packages(exclude=['docs', 'tests']),
     python_requires='>=3.5',
     install_requires=install_requires,
+    extras_require=extras_require,
     include_package_data=True,
     package_data={'dowhy':['VERSION']}
 )
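With the extra wired into setup.py, users opt in explicitly. The package name and the extra name follow directly from this patch; the quotes guard against shell globbing:

    pip install dowhy               # core install; matplotlib is no longer pulled in
    pip install "dowhy[plotting]"   # opt in to plotting support (installs matplotlib)
    pip install -e ".[plotting]"    # same opt-in for a development checkout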