New doc merge tool and stub bundle builder (#142)
* New docify tool that can use introspection of real packages to get docstrings for insertion.
* Removed the old doc files; we don't need them anymore.
* Added a config file and script to build bundled stubs for Pylance. Note this is not working quite right yet because we have a mismatch between the stubs here and the stubs bundled in pyright (several of those are partial stubs, so they will need to move here).
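For context, the introspection step can be sketched roughly as below. This is a hypothetical illustration, not docify's actual code; it assumes the target package (pandas here) is importable in the environment where the tool runs.

    # Hypothetical sketch of introspection-based docstring harvesting.
    # Resolve each dotted symbol name inside a real, imported package and
    # capture its runtime docstring for later insertion into the stubs.
    import importlib
    import inspect

    def collect_docstrings(module_name, symbols):
        """Return a {qualified name: docstring} mapping for the given symbols."""
        module = importlib.import_module(module_name)
        docs = {}
        for name in symbols:
            obj = module
            for part in name.split("."):
                obj = getattr(obj, part)  # walk dotted names, e.g. "DataFrame.take"
            doc = inspect.getdoc(obj)
            if doc:
                docs[module_name + "." + name] = doc
        return docs

    # Example: harvest two pandas docstrings from the installed package.
    docs = collect_docstrings("pandas", ["factorize", "DataFrame.take"])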
Parent: 155e51a8cd
Commit: 518bade887
@@ -357,6 +357,10 @@ MigrationBackup/
.venv

# Build files from utils
utils/stubsplit/.eggs
utils/stubsplit/build
utils/stubsplit/stubsplit.egg-info
.eggs
*.egg-info
utils/*/build
utils/build_bundle/stubs
AUTHORS
ChangeLog
@@ -1,2 +0,0 @@
This directory contains docstrings that we inline into type stubs before bundling the stubs in Pylance. This is only for cases where Pylance cannot otherwise locate the docstrings based on its own heuristics (perhaps because the docstrings are programmatically generated). The stubsplit.py utility in the utils folder is used for merging or splitting such docstrings into or out of stub files.
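The "merge" direction that the removed README describes can be pictured as below. This is a hypothetical sketch of the transformation stubsplit.py performs, not the utility's actual code; the single-line `def` assumption is a simplification.

    # Hypothetical sketch: splice a harvested docstring in as the body of a
    # stub function, so the bundled .pyi carries documentation with it.
    def merge_docstring(stub_lines, func_name, doc):
        out = []
        for line in stub_lines:
            out.append(line)
            if line.lstrip().startswith("def " + func_name + "(") and line.rstrip().endswith(":"):
                indent = " " * (len(line) - len(line.lstrip()) + 4)
                out.append(indent + '"""' + doc + '"""')
                out.append(indent + "pass")
        return out

    # Example: turn a bare stub into a documented one.
    print("\n".join(merge_docstring(["def factorize(values) -> tuple:"], "factorize", "Encode the object.")))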
@@ -1,101 +0,0 @@
def factorize(
    values: Any, sort: bool = ..., na_sentinel: int = ..., size_hint: Union[int, None] = None,
) -> Tuple[np.ndarray, Union[np.ndarray, Index]]:
    """Encode the object as an enumerated type or categorical variable.

    This method is useful for obtaining a numeric representation of an
    array when all that matters is identifying distinct values. `factorize`
    is available as both a top-level function :func:`pandas.factorize`,
    and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.

    Parameters
    ----------
    values : sequence
        A 1-D sequence. Sequences that aren't pandas objects are
        coerced to ndarrays before factorization.
    sort : bool, default False
        Sort `uniques` and shuffle `codes` to maintain the
        relationship.
    na_sentinel : int, default -1
        Value to mark "not found".
    size_hint : int, optional
        Hint to the hashtable sizer.

    Returns
    -------
    codes : ndarray
        An integer ndarray that's an indexer into `uniques`.
        ``uniques.take(codes)`` will have the same values as `values`.
    uniques : ndarray, Index, or Categorical
        The unique valid values. When `values` is Categorical, `uniques`
        is a Categorical. When `values` is some other pandas object, an
        `Index` is returned. Otherwise, a 1-D ndarray is returned.

        .. note::

           Even if there's a missing value in `values`, `uniques` will
           *not* contain an entry for it.

    See Also
    --------
    cut : Discretize continuous-valued array.
    unique : Find the unique values in an array.

    Examples
    --------
    These examples all show factorize as a top-level method like
    ``pd.factorize(values)``. The results are identical for methods like
    :meth:`Series.factorize`.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
    >>> codes
    array([0, 0, 1, 2, 0])
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    With ``sort=True``, the `uniques` will be sorted, and `codes` will be
    shuffled so that the relationship is maintained.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
    >>> codes
    array([1, 1, 0, 2, 1])
    >>> uniques
    array(['a', 'b', 'c'], dtype=object)

    Missing values are indicated in `codes` with `na_sentinel`
    (``-1`` by default). Note that missing values are never
    included in `uniques`.

    >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
    >>> codes
    array([ 0, -1,  1,  2,  0])
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    Thus far, we've only factorized lists (which are internally coerced to
    NumPy arrays). When factorizing pandas objects, the type of `uniques`
    will differ. For Categoricals, a `Categorical` is returned.

    >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1])
    >>> uniques
    [a, c]
    Categories (3, object): [a, b, c]

    Notice that ``'b'`` is in ``uniques.categories``, despite not being
    present in ``cat.values``.

    For all other pandas objects, an Index of the appropriate type is
    returned.

    >>> cat = pd.Series(['a', 'a', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1])
    >>> uniques
    Index(['a', 'c'], dtype='object')
    """
    pass
(Diff for one file is not shown because of its large size.)
@@ -1,730 +0,0 @@
class DataFrameGroupBy(GroupBy):
    def aggregate(self, arg: str, *args, **kwargs) -> DataFrame:
        """Aggregate using one or more operations over the specified axis.

        Parameters
        ----------
        func : function, str, list or dict
            Function to use for aggregating the data. If a function, must either
            work when passed a DataFrame or when passed to DataFrame.apply.

            Accepted combinations are:

            - function
            - string function name
            - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
            - dict of axis labels -> functions, function names or list of such.

        *args
            Positional arguments to pass to `func`.
        **kwargs
            Keyword arguments to pass to `func`.

        Returns
        -------
        scalar, Series or DataFrame

            The return can be:

            * scalar : when Series.agg is called with single function
            * Series : when DataFrame.agg is called with a single function
            * DataFrame : when DataFrame.agg is called with several functions

            Return scalar, Series or DataFrame.

        See Also
        --------
        pandas.DataFrame.groupby.apply
        pandas.DataFrame.groupby.transform
        pandas.DataFrame.aggregate

        Notes
        -----
        `agg` is an alias for `aggregate`. Use the alias.

        A passed user-defined-function will be passed a Series for evaluation.

        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 1, 2, 2],
        ...                    'B': [1, 2, 3, 4],
        ...                    'C': np.random.randn(4)})

        >>> df
           A  B         C
        0  1  1  0.362838
        1  1  2  0.227877
        2  2  3  1.267767
        3  2  4 -0.562860

        The aggregation is for each column.

        >>> df.groupby('A').agg('min')
           B         C
        A
        1  1  0.227877
        2  3 -0.562860

        Multiple aggregations

        >>> df.groupby('A').agg(['min', 'max'])
            B             C
          min max       min       max
        A
        1   1   2  0.227877  0.362838
        2   3   4 -0.562860  1.267767

        Select a column for aggregation

        >>> df.groupby('A').B.agg(['min', 'max'])
           min  max
        A
        1    1    2
        2    3    4

        Different aggregations per column

        >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
            B             C
          min max       sum
        A
        1   1   2  0.590716
        2   3   4  0.704907

        To control the output names with different aggregations per column,
        pandas supports "named aggregation"

        >>> df.groupby("A").agg(
        ...     b_min=pd.NamedAgg(column="B", aggfunc="min"),
        ...     c_sum=pd.NamedAgg(column="C", aggfunc="sum"))
           b_min     c_sum
        A
        1      1 -1.956929
        2      3 -0.322183

        - The keywords are the *output* column names
        - The values are tuples whose first element is the column to select
          and the second element is the aggregation to apply to that column.
          Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
          ``['column', 'aggfunc']`` to make it clearer what the arguments are.
          As usual, the aggregation can be a callable or a string alias.

        See :ref:`groupby.aggregate.named` for more.
        """
        pass

    def filter(self, func: Callable, dropna: bool = ..., *args, **kwargs) -> DataFrame:
        """Return a copy of a DataFrame excluding elements from groups that
        do not satisfy the boolean criterion specified by func.

        Parameters
        ----------
        f : function
            Function to apply to each subframe. Should return True or False.
        dropna : Drop groups that do not pass the filter. True by default;
            if False, groups that evaluate False are filled with NaNs.

        Returns
        -------
        filtered : DataFrame

        Notes
        -----
        Each subframe is endowed the attribute 'name' in case you need to know
        which group you are working on.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> grouped.filter(lambda x: x['B'].mean() > 3.)
             A  B    C
        1  bar  2  5.0
        3  bar  4  1.0
        5  bar  6  9.0
        """
        pass

    def count(self) -> DataFrame:
        """Compute count of group, excluding missing values.

        Returns
        -------
        DataFrame
            Count of values within each group.
        """
        pass

    def nunique(self, dropna: bool = ...) -> DataFrame:
        """
        Return DataFrame with number of distinct observations per group for
        each column.

        Parameters
        ----------
        dropna : bool, default True
            Don't include NaN in the counts.

        Returns
        -------
        nunique : DataFrame

        Examples
        --------
        >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
        ...                           'ham', 'ham'],
        ...                    'value1': [1, 5, 5, 2, 5, 5],
        ...                    'value2': list('abbaxy')})
        >>> df
             id  value1 value2
        0  spam       1      a
        1   egg       5      b
        2   egg       5      b
        3  spam       2      a
        4   ham       5      x
        5   ham       5      y

        >>> df.groupby('id').nunique()
              id  value1  value2
        id
        egg    1       1       1
        ham    1       1       2
        spam   1       2       1

        Check for rows with the same id but conflicting values:

        >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
             id  value1 value2
        0  spam       1      a
        3  spam       2      a
        4   ham       5      x
        5   ham       5      y
        """
        pass

    def boxplot(
        self,
        grouped: DataFrame,
        subplots: bool = ...,
        column: Optional[Union[str, Sequence]] = ...,
        fontsize: Union[int, str] = ...,
        rot: float = ...,
        grid: bool = ...,
        ax: Optional[PlotAxes] = ...,
        figsize: Optional[Tuple[float, float]] = ...,
        layout: Optional[Tuple[int, int]] = ...,
        sharex: bool = ...,
        sharey: bool = ...,
        bins: Union[int, Sequence] = ...,
        backend: Optional[str] = ...,
        **kwargs
    ) -> Union[AxesSubplot, Sequence[AxesSubplot]]:
        """Make box plots from DataFrameGroupBy data.

        Parameters
        ----------
        grouped : Grouped DataFrame
        subplots : bool
            * ``False`` - no subplots will be used
            * ``True`` - create a subplot for each group.

        column : column name or list of names, or vector
            Can be any valid input to groupby.
        fontsize : int or str
        rot : label rotation angle
        grid : Setting this to True will show the grid
        ax : Matplotlib axis object, default None
        figsize : A tuple (width, height) in inches
        layout : tuple (optional)
            The layout of the plot: (rows, columns).
        sharex : bool, default False
            Whether x-axes will be shared among subplots.

            .. versionadded:: 0.23.1
        sharey : bool, default True
            Whether y-axes will be shared among subplots.

            .. versionadded:: 0.23.1
        backend : str, default None
            Backend to use instead of the backend specified in the option
            ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
            specify the ``plotting.backend`` for the whole session, set
            ``pd.options.plotting.backend``.

            .. versionadded:: 1.0.0

        **kwargs
            All other plotting keyword arguments to be passed to
            matplotlib's boxplot function.

        Returns
        -------
        dict of key/value = group key/DataFrame.boxplot return value
        or DataFrame.boxplot return value in case subplots=figures=False

        Examples
        --------
        >>> import itertools
        >>> tuples = [t for t in itertools.product(range(1000), range(4))]
        >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1'])
        >>> data = np.random.randn(len(index),4)
        >>> df = pd.DataFrame(data, columns=list('ABCD'), index=index)
        >>>
        >>> grouped = df.groupby(level='lvl1')
        >>> boxplot_frame_groupby(grouped)
        >>>
        >>> grouped = df.unstack(level='lvl1').groupby(level=0, axis=1)
        >>> boxplot_frame_groupby(grouped, subplots=False)
        """
        pass

    def corrwith(self, other: DataFrame, axis: AxisType = ..., drop: bool = ..., method: str = ...,) -> Series:
        """Compute pairwise correlation.

        Pairwise correlation is computed between rows or columns of
        DataFrame with rows or columns of Series or DataFrame. DataFrames
        are first aligned along both axes before computing the
        correlations.

        Parameters
        ----------
        other : DataFrame, Series
            Object with which to compute correlations.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' to compute column-wise, 1 or 'columns' for
            row-wise.
        drop : bool, default False
            Drop missing indices from result.
        method : {'pearson', 'kendall', 'spearman'} or callable
            Method of correlation:

            * pearson : standard correlation coefficient
            * kendall : Kendall Tau correlation coefficient
            * spearman : Spearman rank correlation
            * callable: callable with input two 1d ndarrays
              and returning a float.

            .. versionadded:: 0.24.0

        Returns
        -------
        Series
            Pairwise correlations.

        See Also
        --------
        DataFrame.corr
        """
        pass

    def fillna(
        self,
        value,
        method: Optional[str] = ...,
        axis: AxisType = ...,
        limit: Optional[int] = ...,
        downcast: Optional[Dict] = ...,
        *,
        inplace: Literal[True]
    ) -> None:
        """Fill NA/NaN values using the specified method.

        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each index (for a Series) or column (for a DataFrame). Values not
            in the dict/Series/DataFrame will not be filled. This value cannot
            be a list.
        method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
            Method to use for filling holes in reindexed Series
            pad / ffill: propagate last valid observation forward to next valid
            backfill / bfill: use next valid observation to fill gap.
        axis : {0 or 'index', 1 or 'columns'}
            Axis along which to fill missing values.
        inplace : bool, default False
            If True, fill in-place. Note: this will modify any
            other views on this object (e.g., a no-copy slice for a column in a
            DataFrame).
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill. In other words, if there is
            a gap with more than this number of consecutive NaNs, it will only
            be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled. Must be greater than 0 if not None.
        downcast : dict, default is None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        DataFrame or None
            Object with missing values filled or None if ``inplace=True``.

        See Also
        --------
        interpolate : Fill NaN values using interpolation.
        reindex : Conform object to new index.
        asfreq : Convert TimeSeries to specified frequency.

        Examples
        --------
        >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
        ...                    [3, 4, np.nan, 1],
        ...                    [np.nan, np.nan, np.nan, 5],
        ...                    [np.nan, 3, np.nan, 4]],
        ...                   columns=list('ABCD'))
        >>> df
             A    B   C  D
        0  NaN  2.0 NaN  0
        1  3.0  4.0 NaN  1
        2  NaN  NaN NaN  5
        3  NaN  3.0 NaN  4

        Replace all NaN elements with 0s.

        >>> df.fillna(0)
             A    B    C  D
        0  0.0  2.0  0.0  0
        1  3.0  4.0  0.0  1
        2  0.0  0.0  0.0  5
        3  0.0  3.0  0.0  4

        We can also propagate non-null values forward or backward.

        >>> df.fillna(method='ffill')
             A    B   C  D
        0  NaN  2.0 NaN  0
        1  3.0  4.0 NaN  1
        2  3.0  4.0 NaN  5
        3  3.0  3.0 NaN  4

        Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
        2, and 3 respectively.

        >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
        >>> df.fillna(value=values)
             A    B    C  D
        0  0.0  2.0  2.0  0
        1  3.0  4.0  2.0  1
        2  0.0  1.0  2.0  5
        3  0.0  3.0  2.0  4

        Only replace the first NaN element.

        >>> df.fillna(value=values, limit=1)
             A    B    C  D
        0  0.0  2.0  2.0  0
        1  3.0  4.0  NaN  1
        2  NaN  1.0  NaN  5
        3  NaN  3.0  NaN  4
        """
        pass

    def hist(
        self,
        data: DataFrame,
        column: Optional[Union[str, Sequence]] = ...,
        by = ...,
        grid: bool = ...,
        xlabelsize: Optional[int] = ...,
        xrot: Optional[float] = ...,
        ylabelsize: Optional[int] = ...,
        yrot: Optional[float] = ...,
        ax: Optional[PlotAxes] = ...,
        sharex: bool = ...,
        sharey: bool = ...,
        figsize: Optional[Tuple[float, float]] = ...,
        layout: Optional[Tuple[int, int]] = ...,
        bins: Union[int, Sequence] = ...,
        backend: Optional[str] = ...,
        **kwargs
    ) -> Union[AxesSubplot, Sequence[AxesSubplot]]:
        """Make a histogram of the DataFrame's columns.

        A `histogram`_ is a representation of the distribution of data.
        This function calls :meth:`matplotlib.pyplot.hist`, on each series in
        the DataFrame, resulting in one histogram per column.

        .. _histogram: https://en.wikipedia.org/wiki/Histogram

        Parameters
        ----------
        data : DataFrame
            The pandas object holding the data.
        column : str or sequence
            If passed, will be used to limit data to a subset of columns.
        by : object, optional
            If passed, then used to form histograms for separate groups.
        grid : bool, default True
            Whether to show axis grid lines.
        xlabelsize : int, default None
            If specified changes the x-axis label size.
        xrot : float, default None
            Rotation of x axis labels. For example, a value of 90 displays the
            x labels rotated 90 degrees clockwise.
        ylabelsize : int, default None
            If specified changes the y-axis label size.
        yrot : float, default None
            Rotation of y axis labels. For example, a value of 90 displays the
            y labels rotated 90 degrees clockwise.
        ax : Matplotlib axes object, default None
            The axes to plot the histogram on.
        sharex : bool, default True if ax is None else False
            In case subplots=True, share x axis and set some x axis labels to
            invisible; defaults to True if ax is None otherwise False if an ax
            is passed in.
            Note that passing in both an ax and sharex=True will alter all x axis
            labels for all subplots in a figure.
        sharey : bool, default False
            In case subplots=True, share y axis and set some y axis labels to
            invisible.
        figsize : tuple
            The size in inches of the figure to create. Uses the value in
            `matplotlib.rcParams` by default.
        layout : tuple, optional
            Tuple of (rows, columns) for the layout of the histograms.
        bins : int or sequence, default 10
            Number of histogram bins to be used. If an integer is given, bins + 1
            bin edges are calculated and returned. If bins is a sequence, gives
            bin edges, including left edge of first bin and right edge of last
            bin. In this case, bins is returned unmodified.
        backend : str, default None
            Backend to use instead of the backend specified in the option
            ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
            specify the ``plotting.backend`` for the whole session, set
            ``pd.options.plotting.backend``.

            .. versionadded:: 1.0.0

        **kwargs
            All other plotting keyword arguments to be passed to
            :meth:`matplotlib.pyplot.hist`.

        Returns
        -------
        matplotlib.AxesSubplot or numpy.ndarray of them

        See Also
        --------
        matplotlib.pyplot.hist : Plot a histogram using matplotlib.

        Examples
        --------
        .. plot::
            :context: close-figs

            This example draws a histogram based on the length and width of
            some animals, displayed in three bins

            >>> df = pd.DataFrame({
            ...     'length': [1.5, 0.5, 1.2, 0.9, 3],
            ...     'width': [0.7, 0.2, 0.15, 0.2, 1.1]
            ...     }, index=['pig', 'rabbit', 'duck', 'chicken', 'horse'])
            >>> hist = df.hist(bins=3)
        """
        pass

    def idxmax(self, axis: AxisType = ..., skipna: bool = ...) -> Series:
        """Return index of first occurrence of maximum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.

        Returns
        -------
        Series
            Indexes of maxima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmax

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmax``.
        """
        pass

    def idxmin(self, axis: AxisType = ..., skipna: bool = ...) -> Series:
        """Return index of first occurrence of minimum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.

        Returns
        -------
        Series
            Indexes of minima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmin

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmin``.
        """
        pass

    def mad(
        self,
        axis: AxisType = ...,
        skipna: bool = ...,
        numeric_only: Optional[bool] = ...,
        *,
        level: Level,
        **kwargs
    ) -> DataFrame:
        """Return the mean absolute deviation of the values for the requested axis.

        Parameters
        ----------
        axis : {index (0), columns (1)}
            Axis for the function to be applied on.
        skipna : bool, default True
            Exclude NA/null values when computing the result.
        level : int or level name, default None
            If the axis is a MultiIndex (hierarchical), count along a
            particular level, collapsing into a Series.
        numeric_only : bool, default None
            Include only float, int, boolean columns. If None, will attempt to use
            everything, then use only numeric data. Not implemented for Series.
        **kwargs
            Additional keyword arguments to be passed to the function.

        Returns
        -------
        Series or DataFrame (if level specified)
        """
        pass

    def skew(
        self, axis: AxisType = ..., skipna: bool = ..., numeric_only: bool = ..., *, level: Level, **kwargs
    ) -> DataFrame:
        """Return unbiased skew over requested axis.

        Normalized by N-1.

        Parameters
        ----------
        axis : {index (0), columns (1)}
            Axis for the function to be applied on.
        skipna : bool, default True
            Exclude NA/null values when computing the result.
        level : int or level name, default None
            If the axis is a MultiIndex (hierarchical), count along a
            particular level, collapsing into a Series.
        numeric_only : bool, default None
            Include only float, int, boolean columns. If None, will attempt to use
            everything, then use only numeric data. Not implemented for Series.
        **kwargs
            Additional keyword arguments to be passed to the function.

        Returns
        -------
        Series or DataFrame (if level specified)
        """
        pass

    def take(self, indices: Sequence, axis: AxisType = ..., **kwargs) -> DataFrame:
        """Return the elements in the given *positional* indices along an axis.

        This means that we are not indexing according to actual values in
        the index attribute of the object. We are indexing according to the
        actual position of the element in the object.

        Parameters
        ----------
        indices : array-like
            An array of ints indicating which positions to take.
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            The axis on which to select elements. ``0`` means that we are
            selecting rows, ``1`` means that we are selecting columns.
        is_copy : bool
            Before pandas 1.0, ``is_copy=False`` can be specified to ensure
            that the return value is an actual copy. Starting with pandas 1.0,
            ``take`` always returns a copy, and the keyword is therefore
            deprecated.

            .. deprecated:: 1.0.0
        **kwargs
            For compatibility with :meth:`numpy.take`. Has no effect on the
            output.

        Returns
        -------
        taken : same type as caller
            An array-like containing the elements taken from the object.

        See Also
        --------
        DataFrame.loc : Select a subset of a DataFrame by labels.
        DataFrame.iloc : Select a subset of a DataFrame by positions.
        numpy.take : Take elements from an array along an axis.

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan)],
        ...                   columns=['name', 'class', 'max_speed'],
        ...                   index=[0, 2, 3, 1])
        >>> df
             name   class  max_speed
        0  falcon    bird      389.0
        2  parrot    bird       24.0
        3    lion  mammal       80.5
        1  monkey  mammal        NaN

        Take elements at positions 0 and 3 along the axis 0 (default).

        Note how the actual indices selected (0 and 1) do not correspond to
        our selected indices 0 and 3. That's because we are selecting the 0th
        and 3rd rows, not rows whose indices equal 0 and 3.

        >>> df.take([0, 3])
             name   class  max_speed
        0  falcon    bird      389.0
        1  monkey  mammal        NaN

        Take elements at indices 1 and 2 along the axis 1 (column selection).

        >>> df.take([1, 2], axis=1)
            class  max_speed
        0    bird      389.0
        2    bird       24.0
        3  mammal       80.5
        1  mammal        NaN

        We may take elements using negative integers for positive indices,
        starting from the end of the object, just like with Python lists.

        >>> df.take([-1, -2])
             name   class  max_speed
        1  monkey  mammal        NaN
        3    lion  mammal       80.5
        """
        pass
@@ -1,39 +0,0 @@
class Index(IndexOpsMixin[T1], PandasObject, Generic[T1]):
    def astype(self, dtype: _str) -> Index:
        """Create an Index with values cast to dtypes. The class of a new Index
        is determined by dtype. When conversion is impossible, a ValueError
        exception is raised.

        Parameters
        ----------
        dtype : numpy dtype or pandas type
            Note that any signed integer `dtype` is treated as ``'int64'``,
            and any unsigned integer `dtype` is treated as ``'uint64'``,
            regardless of the size.
        copy : bool, default True
            By default, astype always returns a newly allocated object.
            If copy is set to False and internal requirements on dtype are
            satisfied, the original data is used to create a new Index
            or the original Index is returned.

        Returns
        -------
        Index
            Index with values cast to specified dtype.
        """
        pass

    def is_monotonic_increasing(self) -> bool:
        """Return if the index is monotonic increasing (only equal or
        increasing) values.

        Examples
        --------
        >>> Index([1, 2, 3]).is_monotonic_increasing
        True
        >>> Index([1, 2, 2]).is_monotonic_increasing
        True
        >>> Index([1, 3, 2]).is_monotonic_increasing
        False
        """
        pass
@@ -1,102 +0,0 @@
def melt(
    frame: DataFrame,
    id_vars: Optional[Union[Tuple, List, np.ndarray]] = ...,
    value_vars: Optional[Union[Tuple, List, np.ndarray]] = ...,
    var_name: Optional[str] = ...,
    value_name: str = ...,
    col_level: Optional[Union[int, str]] = ...,
    ignore_index: bool = ...
) -> DataFrame:
    """Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.

    This function is useful to massage a DataFrame into a format where one
    or more columns are identifier variables (`id_vars`), while all other
    columns, considered measured variables (`value_vars`), are "unpivoted" to
    the row axis, leaving just two non-identifier columns, 'variable' and
    'value'.

    Parameters
    ----------
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot. If not specified, uses all columns that
        are not set as `id_vars`.
    var_name : scalar
        Name to use for the 'variable' column. If None it uses
        ``frame.columns.name`` or 'variable'.
    value_name : scalar, default 'value'
        Name to use for the 'value' column.
    col_level : int or str, optional
        If columns are a MultiIndex then use this level to melt.

    Returns
    -------
    DataFrame
        Unpivoted DataFrame.

    See Also
    --------
    DataFrame.melt
    pivot_table
    DataFrame.pivot
    Series.explode

    Examples
    --------
    >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
    ...                    'B': {0: 1, 1: 3, 2: 5},
    ...                    'C': {0: 2, 1: 4, 2: 6}})
    >>> df
       A  B  C
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> pd.melt(df, id_vars=['A'], value_vars=['B'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5
    3  a        C      2
    4  b        C      4
    5  c        C      6

    The names of 'variable' and 'value' columns can be customized:

    >>> pd.melt(df, id_vars=['A'], value_vars=['B'],
    ...         var_name='myVarname', value_name='myValname')
       A myVarname  myValname
    0  a         B          1
    1  b         B          3
    2  c         B          5

    If you have multi-index columns:

    >>> df.columns = [list('ABC'), list('DEF')]
    >>> df
       A  B  C
       D  E  F
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> pd.melt(df, col_level=0, id_vars=['A'], value_vars=['B'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> pd.melt(df, id_vars=[('A', 'D')], value_vars=[('B', 'E')])
      (A, D) variable_0 variable_1  value
    0      a          B          E      1
    1      b          B          E      3
    2      c          B          E      5
    """
    pass
@@ -1,158 +0,0 @@
def merge(left: DataFrame,
          right: Union[DataFrame, Series],
          how: str = ...,
          on: Optional[Union[Label, Sequence]] = ...,
          left_on: Optional[Union[Label, Sequence]] = ...,
          right_on: Optional[Union[Label, Sequence]] = ...,
          left_index: bool = ...,
          right_index: bool = ...,
          sort: bool = ...,
          suffixes: Sequence[Union[str, None]] = ...,
          copy: bool = ...,
          indicator: Union[bool, str] = ...,
          validate: str = ...) -> DataFrame:
    """Merge DataFrame or named Series objects with a database-style join.

    The join is done on columns or indexes. If joining columns on
    columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
    on indexes or indexes on a column or columns, the index will be passed on.

    Parameters
    ----------
    left : DataFrame
    right : DataFrame or named Series
        Object to merge with.
    how : {'left', 'right', 'outer', 'inner'}, default 'inner'
        Type of merge to be performed.

        * left: use only keys from left frame, similar to a SQL left outer join;
          preserve key order.
        * right: use only keys from right frame, similar to a SQL right outer join;
          preserve key order.
        * outer: use union of keys from both frames, similar to a SQL full outer
          join; sort keys lexicographically.
        * inner: use intersection of keys from both frames, similar to a SQL inner
          join; preserve the order of the left keys.
    on : label or list
        Column or index level names to join on. These must be found in both
        DataFrames. If `on` is None and not merging on indexes then this defaults
        to the intersection of the columns in both DataFrames.
    left_on : label or list, or array-like
        Column or index level names to join on in the left DataFrame. Can also
        be an array or list of arrays of the length of the left DataFrame.
        These arrays are treated as if they are columns.
    right_on : label or list, or array-like
        Column or index level names to join on in the right DataFrame. Can also
        be an array or list of arrays of the length of the right DataFrame.
        These arrays are treated as if they are columns.
    left_index : bool, default False
        Use the index from the left DataFrame as the join key(s). If it is a
        MultiIndex, the number of keys in the other DataFrame (either the index
        or a number of columns) must match the number of levels.
    right_index : bool, default False
        Use the index from the right DataFrame as the join key. Same caveats as
        left_index.
    sort : bool, default False
        Sort the join keys lexicographically in the result DataFrame. If False,
        the order of the join keys depends on the join type (how keyword).
    suffixes : tuple of (str, str), default ('_x', '_y')
        Suffix to apply to overlapping column names in the left and right
        side, respectively. To raise an exception on overlapping columns use
        (False, False).
    copy : bool, default True
        If False, avoid copy if possible.
    indicator : bool or str, default False
        If True, adds a column to output DataFrame called "_merge" with
        information on the source of each row.
        If string, column with information on source of each row will be added to
        output DataFrame, and column will be named value of string.
        Information column is Categorical-type and takes on a value of "left_only"
        for observations whose merge key only appears in 'left' DataFrame,
        "right_only" for observations whose merge key only appears in 'right'
        DataFrame, and "both" if the observation's merge key is found in both.

    validate : str, optional
        If specified, checks if merge is of specified type.

        * "one_to_one" or "1:1": check if merge keys are unique in both
          left and right datasets.
        * "one_to_many" or "1:m": check if merge keys are unique in left
          dataset.
        * "many_to_one" or "m:1": check if merge keys are unique in right
          dataset.
        * "many_to_many" or "m:m": allowed, but does not result in checks.

        .. versionadded:: 0.21.0

    Returns
    -------
    DataFrame
        A DataFrame of the two merged objects.

    See Also
    --------
    merge_ordered : Merge with optional filling/interpolation.
    merge_asof : Merge on nearest keys.
    DataFrame.join : Similar method using indices.

    Notes
    -----
    Support for specifying index levels as the `on`, `left_on`, and
    `right_on` parameters was added in version 0.23.0.
    Support for merging named Series objects was added in version 0.24.0.

    Examples
    --------
    >>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
    ...                     'value': [1, 2, 3, 5]})
    >>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
    ...                     'value': [5, 6, 7, 8]})
    >>> df1
      lkey  value
    0  foo      1
    1  bar      2
    2  baz      3
    3  foo      5
    >>> df2
      rkey  value
    0  foo      5
    1  bar      6
    2  baz      7
    3  foo      8

    Merge df1 and df2 on the lkey and rkey columns. The value columns have
    the default suffixes, _x and _y, appended.

    >>> df1.merge(df2, left_on='lkey', right_on='rkey')
      lkey  value_x rkey  value_y
    0  foo        1  foo        5
    1  foo        1  foo        8
    2  foo        5  foo        5
    3  foo        5  foo        8
    4  bar        2  bar        6
    5  baz        3  baz        7

    Merge DataFrames df1 and df2 with specified left and right suffixes
    appended to any overlapping columns.

    >>> df1.merge(df2, left_on='lkey', right_on='rkey',
    ...           suffixes=('_left', '_right'))
      lkey  value_left rkey  value_right
    0  foo           1  foo            5
    1  foo           1  foo            8
    2  foo           5  foo            5
    3  foo           5  foo            8
    4  bar           2  bar            6
    5  baz           3  baz            7

    Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
    any overlapping columns.

    >>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
    Traceback (most recent call last):
    ...
    ValueError: columns overlap but no suffix specified:
        Index(['value'], dtype='object')
    """
    pass
@@ -1,109 +0,0 @@
def pivot(
    data: DataFrame,
    index: Optional[str] = ...,
    columns: Optional[str] = ...,
    values: Optional[Union[str, Sequence[str]]] = ...,
) -> DataFrame:
    """Return reshaped DataFrame organized by given index / column values.

    Reshape data (produce a "pivot" table) based on column values. Uses
    unique values from specified `index` / `columns` to form axes of the
    resulting DataFrame. This function does not support data
    aggregation; multiple values will result in a MultiIndex in the
    columns. See the :ref:`User Guide <reshaping>` for more on reshaping.

    Parameters
    ----------
    data : DataFrame
    index : str or object, optional
        Column to use to make new frame's index. If None, uses
        existing index.
    columns : str or object
        Column to use to make new frame's columns.
    values : str, object or a list of the previous, optional
        Column(s) to use for populating new frame's values. If not
        specified, all remaining columns will be used and the result will
        have hierarchically indexed columns.

        .. versionchanged:: 0.23.0
           Also accept list of column names.

    Returns
    -------
    DataFrame
        Returns reshaped DataFrame.

    Raises
    ------
    ValueError:
        When there are any `index`, `columns` combinations with multiple
        values. Use `DataFrame.pivot_table` when you need to aggregate.

    See Also
    --------
    DataFrame.pivot_table : Generalization of pivot that can handle
        duplicate values for one index/column pair.
    DataFrame.unstack : Pivot based on the index values instead of a
        column.

    Notes
    -----
    For finer-tuned control, see hierarchical indexing documentation along
    with the related stack/unstack methods.

    Examples
    --------
    >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
    ...                            'two'],
    ...                    'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
    ...                    'baz': [1, 2, 3, 4, 5, 6],
    ...                    'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
    >>> df
       foo bar  baz zoo
    0  one   A    1   x
    1  one   B    2   y
    2  one   C    3   z
    3  two   A    4   q
    4  two   B    5   w
    5  two   C    6   t

    >>> df.pivot(index='foo', columns='bar', values='baz')
    bar  A  B  C
    foo
    one  1  2  3
    two  4  5  6

    >>> df.pivot(index='foo', columns='bar')['baz']
    bar  A  B  C
    foo
    one  1  2  3
    two  4  5  6

    >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
        baz       zoo
    bar   A  B  C   A  B  C
    foo
    one   1  2  3   x  y  z
    two   4  5  6   q  w  t

    A ValueError is raised if there are any duplicates.

    >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
    ...                    "bar": ['A', 'A', 'B', 'C'],
    ...                    "baz": [1, 2, 3, 4]})
    >>> df
       foo bar  baz
    0  one   A    1
    1  one   A    2
    2  two   B    3
    3  two   C    4

    Notice that the first two rows are the same for our `index`
    and `columns` arguments.

    >>> df.pivot(index='foo', columns='bar', values='baz')
    Traceback (most recent call last):
       ...
    ValueError: Index contains duplicate entries, cannot reshape
    """
    pass
(Diff for one file is not shown because of its large size.)
@@ -1,263 +0,0 @@
def read_excel(
    filepath: str,
    sheet_name: Optional[List[str]],
    header: Optional[Union[int, Sequence[int]]] = ...,
    names: Optional[Sequence[str]] = ...,
    index_col: Optional[Union[int, Sequence[int]]] = ...,
    usecols: Optional[Union[int, str, Sequence[Union[int, str, Callable]]]] = ...,
    squeeze: bool = ...,
    dtype: Union[str, Dict[str, Any], Dtype] = ...,
    engine: Optional[str] = ...,
    converters: Optional[Dict[Union[int, str], Callable]] = ...,
    true_values: Optional[Sequence[Scalar]] = ...,
    false_values: Optional[Sequence[Scalar]] = ...,
    skiprows: Optional[Union[Sequence[int], int, Callable]] = ...,
    nrows: Optional[int] = ...,
    na_values = ...,
    keep_default_na: bool = ...,
    verbose: bool = ...,
    parse_dates: Union[bool, Sequence, Dict[str, Sequence]] = ...,
    date_parser: Optional[Callable] = ...,
    thousands: Optional[str] = ...,
    comment: Optional[str] = ...,
    skipfooter: int = ...,
    convert_float: bool = ...,
    mangle_dupe_cols: bool = ...,
) -> Dict[str, DataFrame]:
    """
    Read an Excel file into a pandas DataFrame.

    Supports `xls`, `xlsx`, `xlsm`, `xlsb`, and `odf` file extensions
    read from a local filesystem or URL. Supports an option to read
    a single sheet or a list of sheets.

    Parameters
    ----------
    io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be: ``file://localhost/path/to/table.xlsx``.

        If you want to pass in a path object, pandas accepts any ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handler (e.g. via builtin ``open`` function)
        or ``StringIO``.
    sheet_name : str, int, list, or None, default 0
        Strings are used for sheet names. Integers are used in zero-indexed
        sheet positions. Lists of strings/integers are used to request
        multiple sheets. Specify None to get all sheets.

        Available cases:

        * Defaults to ``0``: 1st sheet as a `DataFrame`
        * ``1``: 2nd sheet as a `DataFrame`
        * ``"Sheet1"``: Load sheet with name "Sheet1"
        * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"
          as a dict of `DataFrame`
        * None: All sheets.

    header : int, list of int, default 0
        Row (0-indexed) to use for the column labels of the parsed
        DataFrame. If a list of integers is passed those row positions will
        be combined into a ``MultiIndex``. Use None if there is no header.
    names : array-like, default None
        List of column names to use. If file contains no header row,
        then you should explicitly pass header=None.
    index_col : int, list of int, default None
        Column (0-indexed) to use as the row labels of the DataFrame.
        Pass None if there is no such column. If a list is passed,
        those columns will be combined into a ``MultiIndex``. If a
        subset of data is selected with ``usecols``, index_col
        is based on the subset.
    usecols : int, str, list-like, or callable, default None
        * If None, then parse all columns.
        * If str, then indicates comma separated list of Excel column letters
          and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
          both sides.
        * If list of int, then indicates list of column numbers to be parsed.
        * If list of string, then indicates list of column names to be parsed.

          .. versionadded:: 0.24.0

        * If callable, then evaluate each column name against it and parse the
          column if the callable returns ``True``.

        Returns a subset of the columns according to behavior above.

        .. versionadded:: 0.24.0

    squeeze : bool, default False
        If the parsed data only contains one column then return a Series.
    dtype : Type name or dict of column -> type, default None
        Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
        Use `object` to preserve data as stored in Excel and not interpret dtype.
        If converters are specified, they will be applied INSTEAD
        of dtype conversion.
    engine : str, default None
        If io is not a buffer or path, this must be set to identify io.
        Acceptable values are None, "xlrd", "openpyxl" or "odf".
    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the Excel cell content, and return the transformed
        content.
    true_values : list, default None
        Values to consider as True.
    false_values : list, default None
        Values to consider as False.
    skiprows : list-like, int, or callable, optional
        Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file.

        If callable, the callable function will be evaluated against the row indices, returning True
        if the row should be skipped and False otherwise. An example of a valid callable argument
        would be lambda x: x in [0, 2].
    nrows : int, default None
        Number of rows to parse.

        .. versionadded:: 0.23.0

    na_values : scalar, str, list-like, or dict, default None
        Additional strings to recognize as NA/NaN. If dict passed, specific
        per-column NA values. By default the following values are interpreted
        as NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
        '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a',
        'nan', 'null'.
    keep_default_na : bool, default True
        Whether or not to include the default NaN values when parsing the data.
        Depending on whether `na_values` is passed in, the behavior is as follows:

        * If `keep_default_na` is True, and `na_values` are specified, `na_values`
          is appended to the default NaN values used for parsing.
        * If `keep_default_na` is True, and `na_values` are not specified, only
          the default NaN values are used for parsing.
        * If `keep_default_na` is False, and `na_values` are specified, only
          the NaN values specified `na_values` are used for parsing.
        * If `keep_default_na` is False, and `na_values` are not specified, no
          strings will be parsed as NaN.

        Note that if `na_filter` is passed in as False, the `keep_default_na` and
        `na_values` parameters will be ignored.
    na_filter : bool, default True
        Detect missing value markers (empty strings and the value of na_values). In
        data without any NAs, passing na_filter=False can improve the performance
        of reading a large file.
    verbose : bool, default False
        Indicate number of NA values placed in non-numeric columns.
    parse_dates : bool, list-like, or dict, default False
        The behavior is as follows:

        * bool. If True -> try parsing the index.
        * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
          each as a separate date column.
        * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
          a single date column.
        * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call
          result 'foo'

        If a column or index contains an unparseable date, the entire column or
        index will be returned unaltered as an object data type. If you don't want to
        parse some cells as date just change their type in Excel to "Text".
        For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``.

        Note: A fast-path exists for iso8601-formatted dates.
    date_parser : function, optional
        Function to use for converting a sequence of string columns to an array of
        datetime instances. The default uses ``dateutil.parser.parser`` to do the
        conversion. Pandas will try to call `date_parser` in three different ways,
        advancing to the next if an exception occurs: 1) Pass one or more arrays
        (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
        string values from the columns defined by `parse_dates` into a single array
        and pass that; and 3) call `date_parser` once for each row using one or
        more strings (corresponding to the columns defined by `parse_dates`) as
        arguments.
    thousands : str, default None
        Thousands separator for parsing string columns to numeric. Note that
        this parameter is only necessary for columns stored as TEXT in Excel,
        any numeric columns will automatically be parsed, regardless of display
        format.
    comment : str, default None
        Comments out remainder of line. Pass a character or characters to this
        argument to indicate comments in the input file. Any data between the
        comment string and the end of the current line is ignored.
    skipfooter : int, default 0
        Rows at the end to skip (0-indexed).
    convert_float : bool, default True
        Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
        data will be read in as floats: Excel stores all numbers as floats
        internally.
    mangle_dupe_cols : bool, default True
        Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
        'X'...'X'. Passing in False will cause data to be overwritten if there
        are duplicate names in the columns.
    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.

    Returns
    -------
    DataFrame or dict of DataFrames
        DataFrame from the passed in Excel file. See notes in sheet_name
        argument for more information on when a dict of DataFrames is returned.

    See Also
    --------
    to_excel : Write DataFrame to an Excel file.
    to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.
    read_fwf : Read a table of fixed-width formatted lines into DataFrame.

    Examples
    --------
    The file can be read using the file name as string or an open file object:

    >>> pd.read_excel('tmp.xlsx', index_col=0)  # doctest: +SKIP
           Name  Value
    0   string1      1
    1   string2      2
    2  #Comment      3

    >>> pd.read_excel(open('tmp.xlsx', 'rb'),
    ...               sheet_name='Sheet3')  # doctest: +SKIP
       Unnamed: 0      Name  Value
    0           0   string1      1
    1           1   string2      2
    2           2  #Comment      3

    Index and header can be specified via the `index_col` and `header` arguments

    >>> pd.read_excel('tmp.xlsx', index_col=None, header=None)  # doctest: +SKIP
         0         1      2
    0  NaN      Name  Value
    1  0.0   string1      1
    2  1.0   string2      2
    3  2.0  #Comment      3

    Column types are inferred but can be explicitly specified

    >>> pd.read_excel('tmp.xlsx', index_col=0,
    ...               dtype={'Name': str, 'Value': float})  # doctest: +SKIP
           Name  Value
    0   string1    1.0
    1   string2    2.0
    2  #Comment    3.0

    True, False, and NA values, and thousands separators have defaults,
    but can be explicitly specified, too. Supply the values you would like
    as strings or lists of strings!

    >>> pd.read_excel('tmp.xlsx', index_col=0,
    ...               na_values=['string1', 'string2'])  # doctest: +SKIP
           Name  Value
    0       NaN      1
    1       NaN      2
    2  #Comment      3

    Comment lines in the excel input file can be skipped using the `comment` kwarg

    >>> pd.read_excel('tmp.xlsx', index_col=0, comment='#')  # doctest: +SKIP
          Name  Value
    0  string1    1.0
    1  string2    2.0
    2     None    NaN
    """
    pass
@@ -1,670 +0,0 @@
def read_csv(
    reader: IO,
    sep: str = ...,
    delimiter: Optional[str] = ...,
    header: Union[int, Sequence[int], str, Literal["infer"]] = ...,
    names: Optional[Sequence[str]] = ...,
    index_col: Optional[Union[int, str, Sequence, Literal[False]]] = ...,
    usecols: Optional[Union[int, str, Sequence]] = ...,
    squeeze: bool = ...,
    prefix: Optional[str] = ...,
    mangle_dupe_cols: bool = ...,
    dtype: Optional[Union[str, Mapping[str, Any]]] = ...,
    engine: Optional[Union[str, Literal["c", "python"]]] = ...,
    converters: Optional[Mapping[Union[int, str], Callable]] = ...,
    true_values: Optional[Sequence[Scalar]] = ...,
    false_values: Optional[Sequence[Scalar]] = ...,
    skipinitialspace: bool = ...,
    skiprows: Optional[Union[Sequence, int, Callable]] = ...,
    skipfooter: int = ...,
    nrows: Optional[int] = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: Union[bool, List[int], List[str]] = ...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser: Optional[Callable] = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: Optional[int] = ...,
    compression: Optional[Union[str, Literal["infer", "gzip", "bz2", "zip", "xz"]]] = ...,
    thousands: Optional[str] = ...,
    decimal: Optional[str] = ...,
    lineterminator: Optional[str] = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: Optional[str] = ...,
    comment: Optional[str] = ...,
    encoding: Optional[str] = ...,
    dialect: Optional[str] = ...,
    error_bad_lines: bool = ...,
    warn_bad_lines: bool = ...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Optional[str] = ...,
) -> TextFileReader:
    """Read a comma-separated values (csv) file into DataFrame.

    Also supports optionally iterating or breaking of the file
    into chunks.

    Additional help can be found in the online docs for
    `IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be: file://localhost/path/to/table.csv.

        If you want to pass in a path object, pandas accepts any ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method, such as
        a file handler (e.g. via builtin ``open`` function) or ``StringIO``.
    sep : str, default ','
        Delimiter to use. If sep is None, the C engine cannot automatically detect
        the separator, but the Python parsing engine can, meaning the latter will
        be used and automatically detect the separator by Python's builtin sniffer
        tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
        different from ``'\s+'`` will be interpreted as regular expressions and
        will also force the use of the Python parsing engine. Note that regex
        delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
    delimiter : str, default ``None``
        Alias for sep.
    header : int, list of int, default 'infer'
        Row number(s) to use as the column names, and the start of the
        data. Default behavior is to infer the column names: if no names
        are passed the behavior is identical to ``header=0`` and column
        names are inferred from the first line of the file, if column
        names are passed explicitly then the behavior is identical to
        ``header=None``. Explicitly pass ``header=0`` to be able to
        replace existing names. The header can be a list of integers that
        specify row locations for a multi-index on the columns
        e.g. [0,1,3]. Intervening rows that are not specified will be
        skipped (e.g. 2 in this example is skipped). Note that this
        parameter ignores commented lines and empty lines if
        ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
        data rather than the first line of the file.
    names : array-like, optional
        List of column names to use. If the file contains a header row,
        then you should explicitly pass ``header=0`` to override the column names.
        Duplicates in this list are not allowed.
    index_col : int, str, sequence of int / str, or False, default ``None``
        Column(s) to use as the row labels of the ``DataFrame``, either given as
        string name or column index. If a sequence of int / str is given, a
        MultiIndex is used.

        Note: ``index_col=False`` can be used to force pandas to *not* use the first
        column as the index, e.g. when you have a malformed file with delimiters at
        the end of each line.
    usecols : list-like or callable, optional
        Return a subset of the columns. If list-like, all elements must either
        be positional (i.e. integer indices into the document columns) or strings
        that correspond to column names provided either by the user in `names` or
        inferred from the document header row(s). For example, a valid list-like
        `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
        Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
        To instantiate a DataFrame from ``data`` with element order preserved use
        ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
        in ``['foo', 'bar']`` order or
        ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
        for ``['bar', 'foo']`` order.

        If callable, the callable function will be evaluated against the column
        names, returning names where the callable function evaluates to True. An
        example of a valid callable argument would be ``lambda x: x.upper() in
        ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
        parsing time and lower memory usage.
    squeeze : bool, default False
        If the parsed data only contains one column then return a Series.
    prefix : str, optional
        Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
    mangle_dupe_cols : bool, default True
        Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
        'X'...'X'. Passing in False will cause data to be overwritten if there
        are duplicate names in the columns.
    dtype : Type name or dict of column -> type, optional
        Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32,
        'c': 'Int64'}
        Use `str` or `object` together with suitable `na_values` settings
        to preserve and not interpret dtype.
        If converters are specified, they will be applied INSTEAD
        of dtype conversion.
    engine : {'c', 'python'}, optional
        Parser engine to use. The C engine is faster while the python engine is
        currently more feature-complete.
    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can either
        be integers or column labels.
    true_values : list, optional
        Values to consider as True.
    false_values : list, optional
        Values to consider as False.
    skipinitialspace : bool, default False
        Skip spaces after delimiter.
    skiprows : list-like, int or callable, optional
        Line numbers to skip (0-indexed) or number of lines to skip (int)
        at the start of the file.

        If callable, the callable function will be evaluated against the row
        indices, returning True if the row should be skipped and False otherwise.
        An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
    skipfooter : int, default 0
        Number of lines at bottom of file to skip (Unsupported with engine='c').
    nrows : int, optional
        Number of rows of file to read. Useful for reading pieces of large files.
    na_values : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN. If dict passed, specific
        per-column NA values. By default the following values are interpreted as
        NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
        '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a',
        'nan', 'null'.
    keep_default_na : bool, default True
        Whether or not to include the default NaN values when parsing the data.
        Depending on whether `na_values` is passed in, the behavior is as follows:

        * If `keep_default_na` is True, and `na_values` are specified, `na_values`
          is appended to the default NaN values used for parsing.
        * If `keep_default_na` is True, and `na_values` are not specified, only
          the default NaN values are used for parsing.
        * If `keep_default_na` is False, and `na_values` are specified, only
          the NaN values specified `na_values` are used for parsing.
        * If `keep_default_na` is False, and `na_values` are not specified, no
          strings will be parsed as NaN.

        Note that if `na_filter` is passed in as False, the `keep_default_na` and
        `na_values` parameters will be ignored.
    na_filter : bool, default True
        Detect missing value markers (empty strings and the value of na_values). In
        data without any NAs, passing na_filter=False can improve the performance
        of reading a large file.
    verbose : bool, default False
        Indicate number of NA values placed in non-numeric columns.
    skip_blank_lines : bool, default True
        If True, skip over blank lines rather than interpreting as NaN values.
    parse_dates : bool or list of int or names or list of lists or dict, default False
        The behavior is as follows:

        * boolean. If True -> try parsing the index.
        * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
          each as a separate date column.
        * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
          a single date column.
        * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call
          result 'foo'

        If a column or index cannot be represented as an array of datetimes,
        say because of an unparseable value or a mixture of timezones, the column
        or index will be returned unaltered as an object data type. For
        non-standard datetime parsing, use ``pd.to_datetime`` after
        ``pd.read_csv``. To parse an index or column with a mixture of timezones,
        specify ``date_parser`` to be a partially-applied
        :func:`pandas.to_datetime` with ``utc=True``. See
        :ref:`io.csv.mixed_timezones` for more.

        Note: A fast-path exists for iso8601-formatted dates.
    infer_datetime_format : bool, default False
        If True and `parse_dates` is enabled, pandas will attempt to infer the
        format of the datetime strings in the columns, and if it can be inferred,
        switch to a faster method of parsing them. In some cases this can increase
        the parsing speed by 5-10x.
    keep_date_col : bool, default False
        If True and `parse_dates` specifies combining multiple columns then
        keep the original columns.
    date_parser : function, optional
        Function to use for converting a sequence of string columns to an array of
        datetime instances. The default uses ``dateutil.parser.parser`` to do the
        conversion. Pandas will try to call `date_parser` in three different ways,
        advancing to the next if an exception occurs: 1) Pass one or more arrays
        (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
        string values from the columns defined by `parse_dates` into a single array
        and pass that; and 3) call `date_parser` once for each row using one or
        more strings (corresponding to the columns defined by `parse_dates`) as
        arguments.
    dayfirst : bool, default False
        DD/MM format dates, international and European format.
    cache_dates : bool, default True
        If True, use a cache of unique, converted dates to apply the datetime
        conversion. May produce significant speed-up when parsing duplicate
        date strings, especially ones with timezone offsets.

        .. versionadded:: 0.25.0
    iterator : bool, default False
        Return TextFileReader object for iteration or getting chunks with
        ``get_chunk()``.
    chunksize : int, optional
        Return TextFileReader object for iteration.
        See the `IO Tools docs
        <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
        for more information on ``iterator`` and ``chunksize``.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer' and
        `filepath_or_buffer` is path-like, then detect compression from the
        following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
        decompression). If using 'zip', the ZIP file must contain only one data
        file to be read in. Set to None for no decompression.
    thousands : str, optional
        Thousands separator.
    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European data).
    lineterminator : str (length 1), optional
        Character to break file into lines. Only valid with C parser.
    quotechar : str (length 1), optional
        The character used to denote the start and end of a quoted item. Quoted
        items can include the delimiter and it will be ignored.
    quoting : int or csv.QUOTE_* instance, default 0
        Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
        QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
    doublequote : bool, default ``True``
        When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
        whether or not to interpret two consecutive quotechar elements INSIDE a
        field as a single ``quotechar`` element.
    escapechar : str (length 1), optional
        One-character string used to escape other characters.
    comment : str, optional
        Indicates remainder of line should not be parsed. If found at the beginning
        of a line, the line will be ignored altogether. This parameter must be a
        single character. Like empty lines (as long as ``skip_blank_lines=True``),
        fully commented lines are ignored by the parameter `header` but not by
        `skiprows`. For example, if ``comment='#'``, parsing
        ``#empty\na,b,c\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
        treated as the header.
    encoding : str, optional
        Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
        standard encodings
        <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
    dialect : str or csv.Dialect, optional
        If provided, this parameter will override values (default or not) for the
        following parameters: `delimiter`, `doublequote`, `escapechar`,
        `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
        override values, a ParserWarning will be issued. See csv.Dialect
        documentation for more details.
    error_bad_lines : bool, default True
        Lines with too many fields (e.g. a csv line with too many commas) will by
        default cause an exception to be raised, and no DataFrame will be returned.
        If False, then these "bad lines" will be dropped from the DataFrame that is
        returned.
    warn_bad_lines : bool, default True
        If error_bad_lines is False, and warn_bad_lines is True, a warning for each
        "bad line" will be output.
    delim_whitespace : bool, default False
        Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
        used as the sep. Equivalent to setting ``sep='\s+'``. If this option
        is set to True, nothing should be passed in for the ``delimiter``
        parameter.
    low_memory : bool, default True
        Internally process the file in chunks, resulting in lower memory use
        while parsing, but possibly mixed type inference. To ensure no mixed
        types either set False, or specify the type with the `dtype` parameter.
        Note that the entire file is read into a single DataFrame regardless,
        use the `chunksize` or `iterator` parameter to return the data in chunks.
        (Only valid with C parser).
    memory_map : bool, default False
        If a filepath is provided for `filepath_or_buffer`, map the file object
        directly onto memory and access the data directly from there. Using this
        option can improve performance because there is no longer any I/O overhead.
    float_precision : str, optional
        Specifies which converter the C engine should use for floating-point
        values. The options are `None` for the ordinary converter,
        `high` for the high-precision converter, and `round_trip` for the
        round-trip converter.

    Returns
    -------
    DataFrame or TextParser
        A comma-separated values (csv) file is returned as two-dimensional
        data structure with labeled axes.

    See Also
    --------
    to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.
    read_fwf : Read a table of fixed-width formatted lines into DataFrame.

    Examples
    --------
    >>> pd.read_csv('data.csv')  # doctest: +SKIP
    """
    pass
def read_table(
    reader: IO,
    sep: str = ...,
    delimiter: Optional[str] = ...,
    header: Union[int, Sequence[int], str, Literal["infer"]] = ...,
    names: Optional[Sequence[str]] = ...,
    index_col: Optional[Union[int, str, Sequence, bool, Literal[False]]] = ...,
    usecols: Optional[Union[int, str, Sequence]] = ...,
    squeeze: bool = ...,
    prefix: Optional[str] = ...,
    mangle_dupe_cols: bool = ...,
    dtype: Optional[Union[str, Mapping[str, Any]]] = ...,
    engine: Optional[Union[str, Literal["c", "python"]]] = ...,
    converters: Optional[Mapping[Union[int, str], Callable]] = ...,
    true_values: Optional[Sequence[Scalar]] = ...,
    false_values: Optional[Sequence[Scalar]] = ...,
    skipinitialspace: bool = ...,
    skiprows: Optional[Union[Sequence, int, Callable]] = ...,
    skipfooter: int = ...,
    nrows: Optional[int] = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: Union[bool, List[int], List[str]] = ...,
    infer_datetime_format: bool = ...,
    keep_date_col: bool = ...,
    date_parser: Optional[Callable] = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: Optional[int] = ...,
    compression: Optional[Union[str, Literal["infer", "gzip", "bz2", "zip", "xz"]]] = ...,
    thousands: Optional[str] = ...,
    decimal: Optional[str] = ...,
    lineterminator: Optional[str] = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: Optional[str] = ...,
    comment: Optional[str] = ...,
    encoding: Optional[str] = ...,
    dialect: Optional[str] = ...,
    error_bad_lines: bool = ...,
    warn_bad_lines: bool = ...,
    delim_whitespace: bool = ...,
    low_memory: bool = ...,
    memory_map: bool = ...,
    float_precision: Optional[str] = ...,
) -> TextFileReader:
    """Read general delimited file into DataFrame.

    Also supports optionally iterating or breaking of the file
    into chunks.

    Additional help can be found in the online docs for
    `IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be: file://localhost/path/to/table.csv.

        If you want to pass in a path object, pandas accepts any ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method, such as
        a file handler (e.g. via builtin ``open`` function) or ``StringIO``.
    sep : str, default '\\t' (tab-stop)
        Delimiter to use. If sep is None, the C engine cannot automatically detect
        the separator, but the Python parsing engine can, meaning the latter will
        be used and automatically detect the separator by Python's builtin sniffer
        tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
        different from ``'\s+'`` will be interpreted as regular expressions and
        will also force the use of the Python parsing engine. Note that regex
        delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
    delimiter : str, default ``None``
        Alias for sep.
    header : int, list of int, default 'infer'
        Row number(s) to use as the column names, and the start of the
        data. Default behavior is to infer the column names: if no names
        are passed the behavior is identical to ``header=0`` and column
        names are inferred from the first line of the file, if column
        names are passed explicitly then the behavior is identical to
        ``header=None``. Explicitly pass ``header=0`` to be able to
        replace existing names. The header can be a list of integers that
        specify row locations for a multi-index on the columns
        e.g. [0,1,3]. Intervening rows that are not specified will be
        skipped (e.g. 2 in this example is skipped). Note that this
        parameter ignores commented lines and empty lines if
        ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
        data rather than the first line of the file.
    names : array-like, optional
        List of column names to use. If the file contains a header row,
        then you should explicitly pass ``header=0`` to override the column names.
        Duplicates in this list are not allowed.
    index_col : int, str, sequence of int / str, or False, default ``None``
        Column(s) to use as the row labels of the ``DataFrame``, either given as
        string name or column index. If a sequence of int / str is given, a
        MultiIndex is used.

        Note: ``index_col=False`` can be used to force pandas to *not* use the first
        column as the index, e.g. when you have a malformed file with delimiters at
        the end of each line.
    usecols : list-like or callable, optional
        Return a subset of the columns. If list-like, all elements must either
        be positional (i.e. integer indices into the document columns) or strings
        that correspond to column names provided either by the user in `names` or
        inferred from the document header row(s). For example, a valid list-like
        `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
        Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
        To instantiate a DataFrame from ``data`` with element order preserved use
        ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
        in ``['foo', 'bar']`` order or
        ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
        for ``['bar', 'foo']`` order.

        If callable, the callable function will be evaluated against the column
        names, returning names where the callable function evaluates to True. An
        example of a valid callable argument would be ``lambda x: x.upper() in
        ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
        parsing time and lower memory usage.
    squeeze : bool, default False
        If the parsed data only contains one column then return a Series.
    prefix : str, optional
        Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
    mangle_dupe_cols : bool, default True
        Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
        'X'...'X'. Passing in False will cause data to be overwritten if there
        are duplicate names in the columns.
    dtype : Type name or dict of column -> type, optional
        Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32,
        'c': 'Int64'}
        Use `str` or `object` together with suitable `na_values` settings
        to preserve and not interpret dtype.
        If converters are specified, they will be applied INSTEAD
        of dtype conversion.
    engine : {'c', 'python'}, optional
        Parser engine to use. The C engine is faster while the python engine is
        currently more feature-complete.
    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can either
        be integers or column labels.
    true_values : list, optional
        Values to consider as True.
    false_values : list, optional
        Values to consider as False.
    skipinitialspace : bool, default False
        Skip spaces after delimiter.
    skiprows : list-like, int or callable, optional
        Line numbers to skip (0-indexed) or number of lines to skip (int)
        at the start of the file.

        If callable, the callable function will be evaluated against the row
        indices, returning True if the row should be skipped and False otherwise.
        An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
    skipfooter : int, default 0
        Number of lines at bottom of file to skip (Unsupported with engine='c').
    nrows : int, optional
        Number of rows of file to read. Useful for reading pieces of large files.
    na_values : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN. If dict passed, specific
        per-column NA values. By default the following values are interpreted as
        NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
        '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a',
        'nan', 'null'.
    keep_default_na : bool, default True
        Whether or not to include the default NaN values when parsing the data.
        Depending on whether `na_values` is passed in, the behavior is as follows:

        * If `keep_default_na` is True, and `na_values` are specified, `na_values`
          is appended to the default NaN values used for parsing.
        * If `keep_default_na` is True, and `na_values` are not specified, only
          the default NaN values are used for parsing.
        * If `keep_default_na` is False, and `na_values` are specified, only
          the NaN values specified `na_values` are used for parsing.
        * If `keep_default_na` is False, and `na_values` are not specified, no
          strings will be parsed as NaN.

        Note that if `na_filter` is passed in as False, the `keep_default_na` and
        `na_values` parameters will be ignored.
    na_filter : bool, default True
        Detect missing value markers (empty strings and the value of na_values). In
        data without any NAs, passing na_filter=False can improve the performance
        of reading a large file.
    verbose : bool, default False
        Indicate number of NA values placed in non-numeric columns.
    skip_blank_lines : bool, default True
        If True, skip over blank lines rather than interpreting as NaN values.
    parse_dates : bool or list of int or names or list of lists or dict, default False
        The behavior is as follows:

        * boolean. If True -> try parsing the index.
        * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
          each as a separate date column.
        * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
          a single date column.
        * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call
          result 'foo'

        If a column or index cannot be represented as an array of datetimes,
        say because of an unparseable value or a mixture of timezones, the column
        or index will be returned unaltered as an object data type. For
        non-standard datetime parsing, use ``pd.to_datetime`` after
        ``pd.read_csv``. To parse an index or column with a mixture of timezones,
        specify ``date_parser`` to be a partially-applied
        :func:`pandas.to_datetime` with ``utc=True``. See
        :ref:`io.csv.mixed_timezones` for more.

        Note: A fast-path exists for iso8601-formatted dates.
    infer_datetime_format : bool, default False
        If True and `parse_dates` is enabled, pandas will attempt to infer the
        format of the datetime strings in the columns, and if it can be inferred,
        switch to a faster method of parsing them. In some cases this can increase
        the parsing speed by 5-10x.
    keep_date_col : bool, default False
        If True and `parse_dates` specifies combining multiple columns then
        keep the original columns.
    date_parser : function, optional
        Function to use for converting a sequence of string columns to an array of
        datetime instances. The default uses ``dateutil.parser.parser`` to do the
        conversion. Pandas will try to call `date_parser` in three different ways,
        advancing to the next if an exception occurs: 1) Pass one or more arrays
        (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
        string values from the columns defined by `parse_dates` into a single array
        and pass that; and 3) call `date_parser` once for each row using one or
        more strings (corresponding to the columns defined by `parse_dates`) as
        arguments.
    dayfirst : bool, default False
        DD/MM format dates, international and European format.
    cache_dates : bool, default True
        If True, use a cache of unique, converted dates to apply the datetime
        conversion. May produce significant speed-up when parsing duplicate
        date strings, especially ones with timezone offsets.

        .. versionadded:: 0.25.0
    iterator : bool, default False
        Return TextFileReader object for iteration or getting chunks with
        ``get_chunk()``.
    chunksize : int, optional
        Return TextFileReader object for iteration.
        See the `IO Tools docs
        <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
        for more information on ``iterator`` and ``chunksize``.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer' and
        `filepath_or_buffer` is path-like, then detect compression from the
        following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
        decompression). If using 'zip', the ZIP file must contain only one data
        file to be read in. Set to None for no decompression.
    thousands : str, optional
        Thousands separator.
    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European data).
    lineterminator : str (length 1), optional
        Character to break file into lines. Only valid with C parser.
    quotechar : str (length 1), optional
        The character used to denote the start and end of a quoted item. Quoted
        items can include the delimiter and it will be ignored.
    quoting : int or csv.QUOTE_* instance, default 0
        Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
        QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
    doublequote : bool, default ``True``
        When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
        whether or not to interpret two consecutive quotechar elements INSIDE a
        field as a single ``quotechar`` element.
    escapechar : str (length 1), optional
        One-character string used to escape other characters.
    comment : str, optional
        Indicates remainder of line should not be parsed. If found at the beginning
        of a line, the line will be ignored altogether. This parameter must be a
        single character. Like empty lines (as long as ``skip_blank_lines=True``),
        fully commented lines are ignored by the parameter `header` but not by
        `skiprows`. For example, if ``comment='#'``, parsing
        ``#empty\na,b,c\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
        treated as the header.
    encoding : str, optional
        Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
        standard encodings
        <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
    dialect : str or csv.Dialect, optional
        If provided, this parameter will override values (default or not) for the
        following parameters: `delimiter`, `doublequote`, `escapechar`,
        `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
        override values, a ParserWarning will be issued. See csv.Dialect
        documentation for more details.
    error_bad_lines : bool, default True
        Lines with too many fields (e.g. a csv line with too many commas) will by
        default cause an exception to be raised, and no DataFrame will be returned.
        If False, then these "bad lines" will be dropped from the DataFrame that is
        returned.
    warn_bad_lines : bool, default True
        If error_bad_lines is False, and warn_bad_lines is True, a warning for each
        "bad line" will be output.
    delim_whitespace : bool, default False
        Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
        used as the sep. Equivalent to setting ``sep='\s+'``. If this option
        is set to True, nothing should be passed in for the ``delimiter``
        parameter.
    low_memory : bool, default True
        Internally process the file in chunks, resulting in lower memory use
        while parsing, but possibly mixed type inference. To ensure no mixed
        types either set False, or specify the type with the `dtype` parameter.
        Note that the entire file is read into a single DataFrame regardless,
        use the `chunksize` or `iterator` parameter to return the data in chunks.
        (Only valid with C parser).
    memory_map : bool, default False
        If a filepath is provided for `filepath_or_buffer`, map the file object
        directly onto memory and access the data directly from there. Using this
        option can improve performance because there is no longer any I/O overhead.
    float_precision : str, optional
        Specifies which converter the C engine should use for floating-point
        values. The options are `None` for the ordinary converter,
        `high` for the high-precision converter, and `round_trip` for the
        round-trip converter.

    Returns
    -------
    DataFrame or TextParser
        A comma-separated values (csv) file is returned as two-dimensional
        data structure with labeled axes.

    See Also
    --------
    to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.
    read_fwf : Read a table of fixed-width formatted lines into DataFrame.

    Examples
    --------
    >>> pd.read_table('data.csv')  # doctest: +SKIP
    """
    pass
@@ -1,74 +0,0 @@
def read_stata(
    path: FilePathOrBuffer,
    convert_dates: bool = ...,
    convert_categoricals: bool = ...,
    index_col: Optional[str] = ...,
    convert_missing: bool = ...,
    preserve_dtypes: bool = ...,
    columns: Optional[Sequence[str]] = ...,
    order_categoricals: bool = ...,
    chunksize: Optional[int] = ...,
    iterator: bool = ...,
) -> DataFrame:
    """Read Stata file into DataFrame.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be: ``file://localhost/path/to/table.dta``.

        If you want to pass in a path object, pandas accepts any ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handler (e.g. via builtin ``open`` function)
        or ``StringIO``.
    convert_dates : bool, default True
        Convert date variables to DataFrame time values.
    convert_categoricals : bool, default True
        Read value labels and convert columns to Categorical/Factor variables.
    index_col : str, optional
        Column to set as index.
    convert_missing : bool, default False
        Flag indicating whether to convert missing values to their Stata
        representations. If False, missing values are replaced with nan.
        If True, columns containing missing values are returned with
        object data types and missing values are represented by
        StataMissingValue objects.
    preserve_dtypes : bool, default True
        Preserve Stata datatypes. If False, numeric data are upcast to pandas
        default types for foreign data (float64 or int64).
    columns : list or None
        Columns to retain. Columns will be returned in the given order. None
        returns all columns.
    order_categoricals : bool, default True
        Flag indicating whether converted categorical data are ordered.
    chunksize : int, default None
        Return StataReader object for iterations, returns chunks with
        given number of lines.
    iterator : bool, default False
        Return StataReader object.

    Returns
    -------
    DataFrame or StataReader

    See Also
    --------
    io.stata.StataReader : Low-level reader for Stata data files.
    DataFrame.to_stata: Export Stata data files.

    Examples
    --------
    Read a Stata dta file:

    >>> df = pd.read_stata('filename.dta')

    Read a Stata dta file in 10,000 line chunks:

    >>> itr = pd.read_stata('filename.dta', chunksize=10000)
    >>> for chunk in itr:
    ...     do_something(chunk)
    """
    pass
@@ -0,0 +1,14 @@
#!/bin/sh

python -m pip install --upgrade pandas
mkdir -p stubs
for d in cv2-stubs django gym-stubs jmespath matplotlib openpyxl pandas pythonwin-stubs scipy-stubs sklearn-stubs sqlalchemy sympy-stubs transformers-stubs win32-stubs win32comext-stubs
do
    cp -R ../../$d stubs
done
(cd ../docify; rm -rf .eggs; PBR_VERSION=1.0.0 pip install .)
(cd stubs/pandas; docify ../../docify-pandas.cfg)
@@ -0,0 +1,51 @@
pandas,core/algorithms.pyi,,factorize
pandas,core/frame.pyi,DataFrame,align
pandas,core/frame.pyi,DataFrame,fillna
pandas,core/frame.pyi,DataFrame,groupby
pandas,core/frame.pyi,DataFrame,isna
pandas,core/frame.pyi,DataFrame,isnull
pandas,core/frame.pyi,DataFrame,items
pandas,core/frame.pyi,DataFrame,iteritems
pandas,core/frame.pyi,DataFrame,notna
pandas,core/frame.pyi,DataFrame,notnull
pandas,core/frame.pyi,DataFrame,pivot
pandas,core/frame.pyi,DataFrame,reindex
pandas,core/frame.pyi,DataFrame,replace
pandas,core/frame.pyi,DataFrame,shift
pandas,core/frame.pyi,DataFrame,sort_index
pandas,core/frame.pyi,DataFrame,sort_values
pandas,core/frame.pyi,DataFrame,to_markdown
pandas,core/series.pyi,Series,aggregate
pandas,core/series.pyi,Series,align
pandas,core/series.pyi,Series,fillna
pandas,core/series.pyi,Series,groupby
pandas,core/series.pyi,Series,isna
pandas,core/series.pyi,Series,isnull
pandas,core/series.pyi,Series,iteritems
pandas,core/series.pyi,Series,notna
pandas,core/series.pyi,Series,notnull
pandas,core/series.pyi,Series,reindex
pandas,core/series.pyi,Series,replace
pandas,core/series.pyi,Series,searchsorted
pandas,core/series.pyi,Series,shift
pandas,core/series.pyi,Series,take
pandas,core/series.pyi,Series,to_markdown
pandas,core/series.pyi,Series,transform
pandas,core/groupby/generic.pyi,DataFrameGroupBy,aggregate
pandas,core/groupby/generic.pyi,DataFrameGroupBy,boxplot
pandas,core/groupby/generic.pyi,DataFrameGroupBy,fillna
pandas,core/groupby/generic.pyi,DataFrameGroupBy,hist
pandas,core/groupby/generic.pyi,DataFrameGroupBy,idxmax
pandas,core/groupby/generic.pyi,DataFrameGroupBy,idxmin
pandas,core/groupby/generic.pyi,DataFrameGroupBy,mad
pandas,core/groupby/generic.pyi,DataFrameGroupBy,skew
pandas,core/groupby/generic.pyi,DataFrameGroupBy,take
pandas,core/indexes/base.pyi,Index,astype
pandas,core/indexes/base.pyi,Index,is_monotonic_increasing
pandas,core/reshape/melt.pyi,,melt
pandas,core/reshape/merge.pyi,,merge
pandas,core/reshape/pivot.pyi,,pivot
pandas,io/excel/_base.pyi,,read_excel
pandas,io/parsers.pyi,,read_csv
pandas,io/parsers.pyi,,read_table
pandas,io/stata.pyi,,read_stata
@@ -0,0 +1,31 @@
Docify is a utility that can insert docstrings into Python type stub files.
We use it when we want docstrings for functions to appear in the stubs but
they are hard to extract from source (for example, because they are generated
programmatically). For now it is primarily used for some pandas APIs.

To install:

    PBR_VERSION=1.0.0 pip install .

Usage:
    docify [--verbose] <configfile> [<stubpath>]
    docify -h | --help
    docify --version

The config file is a CSV file that has lines of the form:

    package,stub_file_path,classname,methodname

If classname is empty, methodname is a top-level function name.

The stub file specified by stub_file_path will be patched with
the docstring of classname.methodname, which will be extracted
by introspection from the specified package.

For example:

    pandas,./core/series.pyi,Series,groupby
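To make the config-to-patch flow concrete, here is a minimal, runnable Python sketch of the effect a single config row has on a stub. The stub text and docstring are invented for illustration; the actual work is done by the docify tool itself, which matches the signature and rewrites its trailing " ..." in place:

    # Invented one-class stub and docstring, standing in for core/series.pyi
    # and the real pandas docstring named by "pandas,./core/series.pyi,Series,groupby".
    stub_lines = [
        'class Series:\n',
        '    def groupby(self, by=...) -> SeriesGroupBy: ...\n',
    ]
    doc = 'Group Series using a mapper or by a Series of columns.'

    # docify replaces the trailing " ..." of the matched signature with the
    # docstring and a "pass" body, indented one level below the signature.
    line = stub_lines[1].rstrip()
    stub_lines[1] = f'{line[:-4]}\n        """\n{doc}\n        """\n        pass\n'
    print(''.join(stub_lines))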
@@ -0,0 +1,2 @@
from .docify import docify
@@ -0,0 +1,95 @@
"""
Docify.

Usage:
    docify [--verbose] <configfile> [<stubpath>]
    docify -h | --help
    docify --version

Options:
    --verbose  Print out details of what docify is doing.
    -h --help  Show this screen.
    --version  Show version.

If stubpath is not specified the current working directory
will be assumed.

The config file is a CSV file that has lines of the form:

    package,stub_file_path,classname,methodname

If classname is empty, methodname is a top-level function name.
The stub_file_path paths should be relative to stubpath.

The stub file specified by stub_file_path will be patched with
the docstring of classname.methodname, which will be extracted
by introspection from the specified package.

For example:

    pandas,core/series.pyi,Series,groupby

"""
import csv
import importlib
import inspect
import os
import sys
from docopt import docopt
import docify


def main():
    arguments = docopt(__doc__, version='docify 0.1')
    configfile = arguments['<configfile>']
    stubpath = arguments['<stubpath>']
    if stubpath is None:
        stubpath = '.'
    verbose = arguments['--verbose']
    with open(configfile) as f:
        patches = csv.reader(f)

        for patch in patches:
            if not patch:  # a blank row ends the config
                break
            pkg, stub, class_, method = patch
            try:
                if pkg in sys.modules:
                    package = sys.modules[pkg]
                else:
                    package = importlib.import_module(pkg)

                obj = package

                # Walk the module path implied by the stub file location,
                # e.g. 'core/frame.pyi' -> package.core.frame.
                for path in stub[:-4].split('/'):
                    obj = obj.__dict__[path]

                if class_:
                    obj = obj.__dict__[class_]
                obj = obj.__dict__[method]
                doc = inspect.getdoc(obj)

            except Exception as e:
                if not class_:
                    print(f'Could not get docstring for {pkg}.{method}: {e}')
                else:
                    print(f'Could not get docstring for {pkg}.{class_}.{method}: {e}')
                sys.exit(-1)

            if not doc:
                if not class_:
                    print(f'{pkg}.{method} has no docstring')
                else:
                    print(f'{pkg}.{class_}.{method} has no docstring')
                sys.exit(-1)

            try:
                docify.docify(os.path.join(stubpath, stub), class_, method, doc, verbose)
            except Exception as e:
                print(e)
                sys.exit(-1)


if __name__ == '__main__':
    main()
    sys.exit(0)
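The introspection step above, for a row such as pandas,core/frame.pyi,DataFrame,groupby, boils down to the following standalone sketch (assuming pandas is importable; the row values are just one example taken from the config file above):

    import importlib
    import inspect

    # Equivalent of the config row "pandas,core/frame.pyi,DataFrame,groupby":
    # walk pandas.core.frame, then DataFrame, then groupby, and read its docstring.
    package = importlib.import_module('pandas')
    obj = package
    for part in 'core/frame.pyi'[:-4].split('/'):   # yields 'core', then 'frame'
        obj = obj.__dict__[part]
    obj = obj.__dict__['DataFrame'].__dict__['groupby']
    print(inspect.getdoc(obj).splitlines()[0])      # first line of the docstring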
@@ -0,0 +1,73 @@
import os
import sys


def docify(stubfile, class_, method, doc, verbose):
    if not os.path.exists(stubfile):
        raise Exception(f'Missing stub file {stubfile}')
    with open(stubfile) as f:
        stublines = f.readlines()

    lookfor = f'class {class_}' if class_ else f'def {method}('
    i = 0
    in_doc = False
    while i < len(stublines):
        line = stublines[i]

        # Look for docstring start/end, as we want to skip docstrings
        # as they can mess with the parser. We use a simple check
        # that relies on """ being on a line on its own, which is
        # true of the docstrings we add at least.

        if line.strip() == '"""':
            in_doc = not in_doc

        elif not in_doc and line.startswith(lookfor):
            # If we are looking for a method, we have found the
            # class but need to keep going...
            if class_:
                i += 1
                lookfor = f'def {method}('
                while i < len(stublines):
                    line = stublines[i].strip()
                    if line == '"""':
                        in_doc = not in_doc
                    elif not in_doc:
                        if line.startswith('class '):
                            raise Exception(f'{stubfile}:{i} Method {method} not found in class {class_}')
                        if line.startswith(lookfor):
                            break
                    i += 1
                else:  # We're all out of lines and didn't find the method
                    raise Exception(f'{stubfile}:{i} Method {method} not found in class {class_}')

            # We have found the first line of the method. The signature can
            # span multiple lines, so we need to look for '...\n' to find the end.
            j = i
            end = i
            while j < len(stublines):
                line = stublines[j].rstrip()
                if line.endswith(' ...'):
                    end = j
                    break
                j += 1
            else:
                raise Exception(f'Could not find end of method {method}')
            break

        i += 1
    else:
        raise Exception(f'{stubfile}: Could not find target {class_} {method}')

    # We now have the start and end of the method. Discard the " ..."
    # and add the docstring and a 'pass'.
    if class_:
        # Class methods sit one indent level deeper than top-level functions.
        stublines[end] = f'{line[:-4]}\n        """\n{doc}\n        """\n        pass\n'
    else:
        stublines[end] = f'{line[:-4]}\n    """\n{doc}\n    """\n    pass\n'

    with open(stubfile, 'w') as f:
        f.writelines(stublines)

    if verbose:
        print(f'Patched {class_} {method} in {stubfile}')
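As a quick smoke test of docify() in isolation — a hypothetical throwaway stub, not part of the repo, and assuming the docify package has been installed as described in the README — one could run:

    # Hypothetical smoke test for docify.docify; writes and patches a throwaway stub.
    import os
    import tempfile
    from docify import docify

    stub = 'class Series:\n    def groupby(self, by=...): ...\n'
    doc = 'Group Series using a mapper or by a Series of columns.'

    path = os.path.join(tempfile.mkdtemp(), 'series.pyi')
    with open(path, 'w') as f:
        f.write(stub)

    docify(path, 'Series', 'groupby', doc, verbose=True)
    with open(path) as f:
        print(f.read())  # the signature is now followed by the docstring and "pass"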
@@ -0,0 +1,6 @@
[metadata]
name = docify
author = Microsoft
license = MIT
long_description = file: README.md
requires-python = >= 3.8
@@ -0,0 +1,15 @@
import setuptools


setuptools.setup(
    entry_points={
        'console_scripts': ['docify=docify.cli:main'],
    },
    setup_requires=['pbr'],
    tests_require=['pytest', 'PyHamcrest'],
    install_requires=[
        'docopt',
    ],
    pbr=True
)