当前位置：首页 > news >正文

网站开发文档范例宁晋网站建设

news 2026/4/27 13:43:47

网站开发文档范例,宁晋网站建设,wordpress new2主题使用,品牌设计有哪些东西 73_Pandas获取分位数/百分位数:使用 quantile() 方法获取 pandas 中 DataFrame 或 Series 的分位数/百分位数。目录:quantile() 的基本用法;指定要获取的分位数/百分位数:参数 q;指定插值方法:参数 interpolation;数据类型 dtype 的差异;指定行… 73_Pandas获取分位数/百分位数:使用 quantile() 方法获取 pandas 中 DataFrame 或 Series 的分位数/百分位数。目录:quantile() 的基本用法;指定要获取的分位数/百分位数:参数 q;指定插值方法:参数 interpolation;数据类型 dtype 的差异;指定行/列:参数 axis;指定是否处理非数字值:参数 numeric_only(用于字符串、用于日期时间、用于布尔值 bool)。本文示例代码的 pandas 版本如下。请注意,行为可能因版本而异。以下面的 DataFrame 为例。 import pandas as pd print(pd.__version__) # 1.3.5 df = pd.DataFrame({'col_1': range(11), 'col_2': [i**2 for i in range(11)]}) print(df) # col_1 col_2 # 0 0 0 # 1 1 1 # 2 2 4 # 3 3 9 # 4 4 16 # 5 5 25 # 6 6 36 # 7 7 49 # 8 8 64 # 9 9 81 # 10 10 100 quantile() 的基本用法:默认情况下,DataFrame 的 quantile() 将每列的中位数(1/2 分位数,即第 50 个百分位数)作为 Series 返回。稍后将解释包含非数字列的情况。 print(df.quantile()) # col_1 5.0 # col_2 25.0 # Name: 0.5, dtype: float64 print(type(df.quantile())) # <class 'pandas.core.series.Series'> 如果从 Series 调用 quantile(),中位数将作为标量值返回。 print(df['col_1'].quantile()) # 5.0 print(type(df['col_1'].quantile())) # <class 'numpy.float64'> 元素类型根据原始数据类型和下述 interpolation 参数的设置而不同。指定要获取的分位数/百分位数:参数 q。在第一个参数 q 中指定想要获取的分位数/百分位数,取值范围为 0.0 到 1.0。 print(df.quantile(0.2)) # col_1 2.0 # col_2 4.0 # Name: 0.2, dtype: float64 也可以用列表指定多个值。在这种情况下,返回值将是一个 DataFrame。 print(df.quantile([0, 0.25, 0.5, 0.75, 1.0])) # col_1 col_2 # 0.00 0.0 0.0 # 0.25 2.5 6.5 # 0.50 5.0 25.0 # 0.75 7.5 56.5 # 1.00 10.0 100.0 print(type(df.quantile([0, 0.25, 0.5, 0.75, 1.0]))) # <class 'pandas.core.frame.DataFrame'> 如果从 Series 调用并指定多个值,则返回值将为 Series。 print(df['col_1'].quantile([0, 0.25, 0.5, 0.75, 1.0])) # 0.00 0.0 # 0.25 2.5 # 0.50 5.0 # 0.75 7.5 # 1.00 10.0 # Name: col_1, dtype: float64 print(type(df['col_1'].quantile([0, 0.25, 0.5, 0.75, 1.0]))) # <class 'pandas.core.series.Series'> 指定插值方法:参数 interpolation。插值方法由参数 interpolation 指定。默认值为“linear”。
print(df.quantile(0.21)) # col_1 2.1 # col_2 4.5 # Name: 0.21, dtype: float64 print(df.quantile(0.21, interpolation='linear')) # col_1 2.1 # col_2 4.5 # Name: 0.21, dtype: float64 “lower”使用较小的值,“higher”使用较大的值,“nearest”使用最接近的值。 print(df.quantile(0.21, interpolation='lower')) # col_1 2 # col_2 4 # Name: 0.21, dtype: int64 print(df.quantile(0.21, interpolation='higher')) # col_1 3 # col_2 9 # Name: 0.21, dtype: int64 print(df.quantile(0.21, interpolation='nearest')) # col_1 2 # col_2 4 # Name: 0.21, dtype: int64 “midpoint”是前一个值和后一个值的中间值(平均值)。 print(df.quantile(0.21, interpolation='midpoint')) # col_1 2.5 # col_2 6.5 # Name: 0.21, dtype: float64 数据类型 dtype 的差异:默认是线性插值,因此如果原始数据类型 dtype 是整数 int,则会转换为浮点数 float。请注意,即使该值与原始值相同,数据类型也会改变。 print(df.quantile(0.2)) # col_1 2.0 # col_2 4.0 # Name: 0.2, dtype: float64 在“lower”、“higher”和“nearest”的情况下,按原样使用原始值,因此数据类型保持不变。 print(df.quantile(0.2, interpolation='lower')) # col_1 2 # col_2 4 # Name: 0.2, dtype: int64 指定行/列:参数 axis。默认是按列处理,但如果 axis 参数设置为 1 或 'columns',则会按行处理。 print(df.quantile(axis=1)) # 0 0.0 # 1 1.0 # 2 3.0 # 3 6.0 # 4 10.0 # 5 15.0 # 6 21.0 # 7 28.0 # 8 36.0 # 9 45.0 # 10 55.0 # Name: 0.5, dtype: float64 指定是否处理非数字值:参数 numeric_only。可以使用参数 numeric_only 指定是否处理非数字列。将 numeric_only 设置为 True 时仅处理数字列,设置为 False 时处理所有类型的列。从 pandas 2.0 开始,numeric_only 的默认值为 False;在此之前默认值为 True。请注意,这取决于版本。用于字符串:以添加了字符串列的 DataFrame 为例。 df_str = df.copy() df_str['col_3'] = list('abcdefghijk') print(df_str) # col_1 col_2 col_3 # 0 0 0 a # 1 1 1 b # 2 2 4 c # 3 3 9 d # 4 4 16 e # 5 5 25 f # 6 6 36 g # 7 7 49 h # 8 8 64 i # 9 9 81 j # 10 10 100 k print(df_str.dtypes) # col_1 int64 # col_2 int64 # col_3 object # dtype: object 如果参数 numeric_only 设置为 True,则仅以数字列为目标,并且排除字符串列。 print(df_str.quantile(numeric_only=True)) # col_1 5.0 # col_2 25.0 # Name: 0.5, dtype: float64 当参数 numeric_only 设置为 False(从 pandas 2.0 开始为默认值)并以字符串列为目标时,如果参数 interpolation 是“linear”(默认)或“midpoint”,则会发生错误。对于“lower”、“higher”和“nearest”,结果将是按字典顺序的前一个值或后一个值。 # print(df_str.quantile()) # TypeError: unsupported operand type(s) for -: 'str' and 'str' # 
print(df_str.quantile(interpolation='midpoint')) # TypeError: unsupported operand type(s) for -: 'str' and 'str' print(df_str.quantile([0.2, 0.21, 0.3], interpolation='lower')) # col_1 col_2 col_3 # 0.20 2 4 c # 0.21 2 4 c # 0.30 3 9 d print(df_str.quantile([0.2, 0.21, 0.3], interpolation='higher')) # col_1 col_2 col_3 # 0.20 2 4 c # 0.21 3 9 d # 0.30 3 9 d print(df_str.quantile([0.2, 0.21, 0.3], interpolation='nearest')) # col_1 col_2 col_3 # 0.20 2 4 c # 0.21 2 4 c # 0.30 3 9 d 用于日期时间:以添加了日期时间列的 DataFrame 为例。 df_dt = df.copy() df_dt['col_3'] = pd.date_range('2023-01-01', '2023-01-11') print(df_dt) # col_1 col_2 col_3 # 0 0 0 2023-01-01 # 1 1 1 2023-01-02 # 2 2 4 2023-01-03 # 3 3 9 2023-01-04 # 4 4 16 2023-01-05 # 5 5 25 2023-01-06 # 6 6 36 2023-01-07 # 7 7 49 2023-01-08 # 8 8 64 2023-01-09 # 9 9 81 2023-01-10 # 10 10 100 2023-01-11 print(df_dt.dtypes) # col_1 int64 # col_2 int64 # col_3 datetime64[ns] # dtype: object 如果参数 numeric_only 设置为 True,则仅将数字列作为目标,并且将排除日期和时间列。 print(df_dt.quantile(numeric_only=True)) # col_1 5.0 # col_2 25.0 # Name: 0.5, dtype: float64 即使 interpolation 参数是“linear”(默认)或“midpoint”,日期和时间列也会被正确插值。当然,“lower”、“higher”和“nearest”也是可以接受的。 print(df_dt.quantile([0.2, 0.21, 0.3])) # col_1 col_2 col_3 # 0.20 2.0 4.0 2023-01-03 00:00:00 # 0.21 2.1 4.5 2023-01-03 02:24:00 # 0.30 3.0 9.0 2023-01-04 00:00:00 print(df_dt.quantile([0.2, 0.21, 0.3], interpolation='midpoint')) # col_1 col_2 col_3 # 0.20 2.0 4.0 2023-01-03 00:00:00 # 0.21 2.5 6.5 2023-01-03 12:00:00 # 0.30 3.0 9.0 2023-01-04 00:00:00 print(df_dt.quantile([0.2, 0.21, 0.3], interpolation='lower')) # col_1 col_2 col_3 # 0.20 2 4 2023-01-03 # 0.21 2 4 2023-01-03 # 0.30 3 9 2023-01-04 print(df_dt.quantile([0.2, 0.21, 0.3], interpolation='higher')) # col_1 col_2 col_3 # 0.20 2 4 2023-01-03 # 0.21 3 9 2023-01-04 # 0.30 3 9 2023-01-04 print(df_dt.quantile([0.2, 0.21, 0.3], interpolation='nearest')) # col_1 col_2 col_3 # 0.20 2 4 2023-01-03 # 0.21 2 4 2023-01-03 # 0.30 3 9 2023-01-04 用于布尔值 bool:以添加了一列布尔值(bool)的 DataFrame 为例。 df_bool = df.copy() 
df_bool['col_3'] = [True, False, True, False, True, False, True, False, True, False, True] print(df_bool) # col_1 col_2 col_3 # 0 0 0 True # 1 1 1 False # 2 2 4 True # 3 3 9 False # 4 4 16 True # 5 5 25 False # 6 6 36 True # 7 7 49 False # 8 8 64 True # 9 9 81 False # 10 10 100 True print(df_bool.dtypes) # col_1 int64 # col_2 int64 # col_3 bool # dtype: object 可以使用 select_dtypes() 排除 bool 列,也可以使用 astype() 将其转换为整数 int。 print(df_bool.select_dtypes(exclude=bool)) # col_1 col_2 # 0 0 0 # 1 1 1 # 2 2 4 # 3 3 9 # 4 4 16 # 5 5 25 # 6 6 36 # 7 7 49 # 8 8 64 # 9 9 81 # 10 10 100 print(df_bool.select_dtypes(exclude=bool).quantile()) # col_1 5.0 # col_2 25.0 # Name: 0.5, dtype: float64 print(df_bool.astype({'col_3': int})) # col_1 col_2 col_3 # 0 0 0 1 # 1 1 1 0 # 2 2 4 1 # 3 3 9 0 # 4 4 16 1 # 5 5 25 0 # 6 6 36 1 # 7 7 49 0 # 8 8 64 1 # 9 9 81 0 # 10 10 100 1 print(df_bool.astype({'col_3': int}).quantile()) # col_1 5.0 # col_2 25.0 # col_3 1.0 # Name: 0.5, dtype: float64

查看全文

http://www.hkea.cn/news/14435531/