Source code for absfuyu.extra.da.dadf

"""
Absfuyu: Data Analysis
----------------------
Data Analyst DataFrame

Version: 5.1.0
Date updated: 10/03/2025 (dd/mm/yyyy)
"""

# Module level
# ---------------------------------------------------------------------------
# Public API of this module: the combined ``DADF`` class first, followed by
# the individual mixin classes that ``DADF`` inherits from.
__all__ = [
    "DADF",
    "DataAnalystDataFrameColumnMethodMixin",
    "DataAnalystDataFrameRowMethodMixin",
    "DataAnalystDataFrameInfoMixin",
    "DataAnalystDataFrameNAMixin",
    "DataAnalystDataFrameOtherMixin",
    "DataAnalystDataFrameDateMixin",
    "DataAnalystDataFrameCityMixin",
]


# Library
# ---------------------------------------------------------------------------
import random
import string
from collections.abc import Callable, Sequence
from datetime import datetime, timedelta
from typing import Any, Literal, Self

import numpy as np
import pandas as pd

try:
    from typing import override  # type: ignore
except ImportError:
    from absfuyu.core.decorator import dummy_decorator as override

from absfuyu.core.baseclass import ShowAllMethodsMixin
from absfuyu.core.docstring import deprecated, versionadded
from absfuyu.core.typings import _R, _T
from absfuyu.extra.da.dadf_base import CityData
from absfuyu.extra.da.dadf_base import DataAnalystDataFrameBase as DFBase
from absfuyu.extra.da.dadf_base import SplittedDF
from absfuyu.logger import logger
from absfuyu.util import set_min_max


# Column method
# ---------------------------------------------------------------------------
class DataAnalystDataFrameColumnMethodMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Column method

    - Rearrange rightmost column
    - Drop columns
    - Drop rightmost column
    - Add blank column
    """

    def rearrange_rightmost_column(
        self, insert_to_col: str, num_of_cols: int = 1
    ) -> Self:
        """
        Move right-most columns to selected position

        Parameters
        ----------
        insert_to_col : str
            Name of the column that the right-most column will be moved next to

        num_of_cols : int
            Number of columns moved, by default ``1``
            (restricted to the range ``1 .. number of columns``)

        Returns
        -------
        Self
            New DataFrame with the columns reordered. When ``insert_to_col``
            is itself one of the moved columns, the order is left unchanged.

        Raises
        ------
        ValueError
            When ``insert_to_col`` is not a column of the DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big number_range  missing_value      text       date
        0 -1.583590         756          700            NaN  eqklyckc 2023-05-20
        1  0.203968         167          100            NaN  wzrsxinb 2011-02-27
        >>> df.rearrange_rightmost_column("number")
             number       date  number_big number_range  missing_value      text
        0 -1.583590 2023-05-20         756          700            NaN  eqklyckc
        1  0.203968 2011-02-27         167          100            NaN  wzrsxinb
        """
        cols: list[str] = self.columns.to_list()  # List of columns
        num_of_cols = int(set_min_max(num_of_cols, min_value=1, max_value=len(cols)))
        anchor: int = cols.index(insert_to_col)
        split_at: int = len(cols) - num_of_cols  # First index of the moved block

        if anchor >= split_at:
            # The anchor is inside the moved block: slicing around it would
            # repeat column labels, so keep the current order instead.
            new_cols: list[str] = cols
        else:
            new_cols = (
                cols[: anchor + 1] + cols[split_at:] + cols[anchor + 1 : split_at]
            )
        return self.__class__(self[new_cols])

    def drop_columns(self, columns: Sequence[str]) -> Self:
        """
        Drop columns in DataFrame

        Columns that do not exist are skipped (a debug message is logged).

        Parameters
        ----------
        columns : Sequence[str]
            List of columns need to drop

        Returns
        -------
        Self
            Modified DataFrame (modified in place)


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big number_range  missing_value      text       date
        0 -0.283019         666          600            NaN  ztoeeblx 2022-11-13
        1  1.194725         939          900            NaN  fxardqvh 2005-08-04
        >>> df.drop_columns(["date", "text"])
             number  number_big number_range  missing_value
        0 -0.283019         666          600            NaN
        1  1.194725         939          900            NaN
        """
        for column in columns:
            try:
                self.drop(columns=[column], inplace=True)
            except KeyError:
                logger.debug(f"{column} column does not exist")
        return self

    def drop_rightmost(self, num_of_cols: int = 1) -> Self:
        """
        Drop ``num_of_cols`` right-most columns

        Parameters
        ----------
        num_of_cols : int
            Number of columns to drop
            (restricted to the range ``1 .. number of columns``)

        Returns
        -------
        Self
            Modified DataFrame (modified in place)


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big number_range  missing_value      text       date
        0  0.851953         572          500              5  ncpbnzef 2020-08-15
        1  0.381643         595          500             53  iojogbgj 2011-12-04
        >>> df.drop_rightmost(5)
             number
        0  0.851953
        1  0.381643
        """
        # Nothing to drop: avoid indexing into an empty column index
        if self.shape[1] == 0:
            return self

        num_of_cols = int(
            set_min_max(num_of_cols, min_value=1, max_value=self.shape[1])
        )

        # Drop the whole right-most block in one call
        self.drop(columns=self.columns[-num_of_cols:], inplace=True)
        return self

    @deprecated("5.1.0", reason="Use pd.DataFrame.assign(...) method instead")
    def add_blank_column(self, column_name: str, fill: Any = np.nan, /) -> Self:
        """
        Add a blank column

        Parameters
        ----------
        column_name : str
            Name of the column to add

        fill : Any
            Fill the column with data

        Returns
        -------
        Self
            Modified DataFrame (modified in place)
        """
        # One copy of ``fill`` per row, so a list-like ``fill`` is stored
        # as a cell value rather than being spread over the rows.
        self[column_name] = [fill] * self.shape[0]
        return self
# Row method # ---------------------------------------------------------------------------
class DataAnalystDataFrameRowMethodMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Row method

    - Get different rows
    """

    @versionadded("4.0.0")
    def get_different_rows(self, other: Self | pd.DataFrame) -> Self:
        """
        Subtract DataFrame to find the different rows

        The rows returned are the rows of ``other`` that have no identical
        row in this DataFrame.

        Parameters
        ----------
        other : Self | pd.DataFrame
            DataFrame to subtract

        Returns
        -------
        Self
            Different row DataFrame


        Example:
        --------
        >>> df1 = DADF({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]})
        >>> df2 = DADF({"A": [1, 2, 3, 4], "B": [7, 6, 6, 8]})
        >>> df1.get_different_rows(df2)
           A  B
        0  1  7
        2  3  6
        """
        # Right join with the merge indicator, then keep "right_only" rows
        merged = self.merge(other, how="right", indicator=True)
        only_in_other = merged[merged["_merge"] == "right_only"]
        return self.__class__(only_in_other.drop("_merge", axis=1))
# Info # ---------------------------------------------------------------------------
class DataAnalystDataFrameInfoMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Info

    - Quick info
    - Quick describe
    - Show distribution
    - Threshold filter
    """

    # Quick info
    @versionadded("3.2.0")
    def qinfo(self) -> str:
        """
        Show quick infomation about DataFrame

        Returns
        -------
        str
            Number of rows, columns, observations, missing values
            and the list of column names


        Example:
        --------
        >>> DADF.sample_df().qinfo()
        Dataset Information:
        - Number of Rows: 100
        - Number of Columns: 6
        - Total observation: 600
        - Missing value: 13 (2.17%)

        Column names:
        ['number', 'number_big', 'number_range', 'missing_value', 'text', 'date']
        """
        missing_values = int(self.isnull().sum().sum())
        total_observation = self.shape[0] * self.shape[1]
        # An empty DataFrame has no observation: report 0% instead of dividing by 0
        mv_rate = (
            missing_values / total_observation * 100 if total_observation else 0.0
        )
        info = (
            f"Dataset Information:\n"
            f"- Number of Rows: {self.shape[0]:,}\n"
            f"- Number of Columns: {self.shape[1]:,}\n"
            f"- Total observation: {total_observation:,}\n"
            f"- Missing value: {missing_values:,} ({mv_rate:.2f}%)\n\n"
            f"Column names:\n{self.columns.to_list()}"
        )
        return info

    @override
    def describe(self, percentiles=None, include=None, exclude=None) -> Self:
        """pd.DataFrame.describe() override (result is wrapped in this class)"""
        return self.__class__(super().describe(percentiles, include, exclude))  # type: ignore [no-any-return]

    # Quick describe
    @versionadded("3.2.0")
    def qdescribe(self) -> Self:
        """
        Quick ``describe()`` that exclude ``object`` and ``datetime`` dtype

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> DADF.sample_df().qdescribe()
                   number  number_big  missing_value
        count  100.000000  100.000000      48.000000
        mean    -0.052935  586.750000      22.916667
        std      0.954170  237.248596      11.987286
        min     -2.392952  105.000000       3.000000
        25%     -0.738311  407.500000      13.000000
        50%     -0.068014  607.000000      23.500000
        75%      0.614025  790.250000      36.000000
        max      2.512533  988.000000      42.000000
        """
        kept_columns = self.select_dtypes(exclude=["object", "datetime"]).columns
        return self.__class__(  # type: ignore [no-any-return]
            self[kept_columns].describe()
        )

    @versionadded("3.2.0")
    def show_distribution(
        self,
        column_name: str,
        dropna: bool = True,
        *,
        show_percentage: bool = True,
        percentage_round_up: int = 2,
    ) -> Self:
        """
        Show distribution of a column

        Parameters
        ----------
        column_name : str
            Column to show distribution

        dropna : bool
            Count N/A when ``False``
            (Default: ``True``)

        show_percentage : bool
            Show proportion in range 0% - 100% instead of [0, 1]
            (Default: ``True``)

        percentage_round_up : int
            Round up to which decimals
            (Default: ``2``)

        Returns
        -------
        Self
            Distribution DataFrame with columns
            ``[column_name, "count", "percentage"]``.
            The percentage is relative to the total number of rows.


        Example:
        --------
        >>> DADF.sample_df().show_distribution("number_range")
          number_range  count  percentage
        0          900     16        16.0
        1          700     15        15.0
        2          300     12        12.0
        3          200     12        12.0
        4          400     11        11.0
        5          600     11        11.0
        6          800     10        10.0
        7          100      9         9.0
        8          500      4         4.0
        """
        counts = self[column_name].value_counts(dropna=dropna)
        # Name both columns explicitly so the layout does not depend on the
        # naming that ``value_counts()`` uses in the installed pandas version
        out = counts.rename_axis(column_name).reset_index(name="count")

        scale = 100 if show_percentage else 1
        out["percentage"] = (out["count"] / self.shape[0] * scale).round(
            percentage_round_up
        )
        return self.__class__(out)

    @deprecated("5.1.0", reason="Rework THIS")
    def threshold_filter(
        self,
        destination_column: str,
        threshold: int | float = 10,
        *,
        top: int | None = None,
        replace_with: Any = "Other",
    ) -> Self:
        """
        Filter out percentage of data that smaller than the ``threshold``,
        replace all of the smaller data to ``replace_with``.
        As a result, pie chart is less messy.

        The result is written to a new ``"<destination_column>_filtered"`` column.

        Parameters
        ----------
        destination_column : str
            Column to be filtered

        threshold : int | float
            Which percentage to cut-off
            (Default: 10%)

        top : int
            Only show top ``x`` categories in pie chart
            (replace threshold mode)
            (Default: ``None``)

        replace_with : Any
            Replace all of the smaller data with specified value

        Returns
        -------
        Self
            Modified DataFrame (modified in place)
        """
        # Clean: remove leading/trailing whitespace (text columns only)
        try:
            self[destination_column] = self[destination_column].str.strip()
        except AttributeError:
            # Raised by the ``.str`` accessor on a non-text column
            logger.debug(f"{destination_column} column is not text, skip strip")

        # Logic
        col_df = self.show_distribution(destination_column)

        # Values that will not be renamed
        if top is not None:
            # ``top - 1`` kept categories + ``replace_with`` = ``top`` categories
            keep_count = int(
                set_min_max(top - 1, min_value=1, max_value=col_df.shape[0])
            )
            list_of_keep: list = col_df[destination_column].head(keep_count).to_list()
        else:
            list_of_keep = col_df[col_df["percentage"] >= threshold][
                destination_column
            ].to_list()

        keep = set(list_of_keep)
        self[f"{destination_column}_filtered"] = self[destination_column].apply(
            lambda x: x if x in keep else replace_with
        )

        # Return
        return self
# Missing value # ---------------------------------------------------------------------------
class DataAnalystDataFrameNAMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Missing value

    - Fill missing values
    - Get missing values
    - Split N/A
    - Apply not null
    - Apply not null row
    """

    def fill_missing_values(
        self, column_name: str, fill: Any = np.nan, *, fill_when_not_exist: Any = np.nan
    ) -> Self:
        """
        Fill missing values in specified column

        Parameters
        ----------
        column_name : str
            Column name

        fill : Any
            Fill the missing values with, by default ``np.nan``

        fill_when_not_exist : Any
            When ``column_name`` does not exist,
            create a new column and fill with
            ``fill_when_not_exist``, by default ``np.nan``

        Returns
        -------
        Self
            Modified DataFrame (modified in place)


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big number_range  missing_value      text       date
        0  0.174303         926          900            NaN  tenkiakh 2006-09-08
        1  0.305137         140          100            NaN  jzuddamf 2012-04-04
        >>> df.fill_missing_values("missing_value", 0)
             number  number_big number_range  missing_value      text       date
        0  0.174303         926          900            0.0  tenkiakh 2006-09-08
        1  0.305137         140          100            0.0  jzuddamf 2012-04-04
        >>> df.fill_missing_values("missing_column", 0, fill_when_not_exist=0)
             number  number_big number_range  missing_value      text       date  missing_column
        0  0.174303         926          900            0.0  tenkiakh 2006-09-08               0
        1  0.305137         140          100            0.0  jzuddamf 2012-04-04               0
        """
        if column_name in self.columns:
            self[column_name] = self[column_name].fillna(fill)
        else:
            # Create the column here so that the documented behavior does not
            # depend on another mixin being part of the final class
            self[column_name] = [fill_when_not_exist] * self.shape[0]
        return self

    def get_missing_values(
        self, hightlight: bool = True, *, percentage_round_up: int = 2
    ) -> Self:
        """
        Get a DataFrame contains count of missing values for each column

        Parameters
        ----------
        hightlight : bool
            Shows only columns with missing values when ``True``, by default ``True``

        percentage_round_up : int
            Round up to which decimals, by default ``2``

        Returns
        -------
        Self
            Missing value DataFrame, indexed by column name, with columns
            ``"Num of N/A"`` and ``"Percentage"`` (relative to the number of rows)


        Example:
        --------
        >>> DADF.sample_df(152).get_missing_values()
                       Num of N/A  Percentage
        missing_value          42       27.63
        """
        # Number of N/A in each column, largest first
        na_count = self.isnull().sum().sort_values(ascending=False)
        if hightlight:
            na_count = na_count[na_count != 0]

        out = na_count.to_frame(name="Num of N/A")
        out["Percentage"] = (out["Num of N/A"] / self.shape[0] * 100).round(
            percentage_round_up
        )
        return self.__class__(out)

    @versionadded("3.1.0")
    def split_na(self, by_column: str) -> SplittedDF:
        """
        Split DataFrame into 2 parts:
            - Without missing value in specified column
            - With missing value in specified column

        Parameters
        ----------
        by_column : str
            Split by column

        Returns
        -------
        SplittedDF
            Splitted DataFrame


        Example:
        --------
        >>> DADF.sample_df(10).split_na("missing_value")
        SplittedDF(
            df=
               number  number_big number_range  missing_value      text       date
            0  0.643254         690          600            3.0  cinvofwj 2018-08-15
            2  0.499345         255          200           13.0  jasifzez 2005-06-01
            3 -1.727036         804          800           38.0  esxjmger 2009-07-24,
            df_na=
               number  number_big number_range  missing_value      text       date
            1 -0.411354         363          300            NaN  juzecani 2014-12-02
            6 -0.833857         531          500            NaN  ybnntryh 2023-11-03
        )
        """
        is_na = self[by_column].isna()
        out = SplittedDF(
            df=self[~is_na],  # Rows without N/A in ``by_column``
            df_na=self[is_na],  # Rows with N/A in ``by_column``
        )
        return out

    @versionadded("5.1.0")
    def apply_notnull(self, col: str, callable: Callable[[Any], _R]) -> Self:
        """
        Only apply callable to not NaN value in column

        Parameters
        ----------
        col : str
            Column to apply

        callable : Callable[[Any], _R]
            Callable

        Returns
        -------
        Self
            Applied DataFrame (modified in place)


        Example:
        --------
        >>> DADF.sample_df(5).apply_notnull("missing_value", lambda _: "REPLACED")
             number  number_big number_range missing_value      text       date
        0  0.852218         157          100      REPLACED  dqzxaxxs 2006-03-08
        1  1.522428         616          600           NaN  mivkaooe 2018-12-27
        2  0.108506         745          700      REPLACED  qanwwjet 2005-07-14
        3 -1.435079         400          400      REPLACED  ywahcasi 2024-05-20
        4  0.118993         861          800      REPLACED  saoupuby 2019-04-28
        """
        self[col] = self[col].apply(lambda x: callable(x) if pd.notnull(x) else x)
        return self

    @versionadded("5.1.0")  # type: ignore
    def apply_notnull_row(
        self,
        apply_when_null: Callable[[Any], _R] | _T | None = None,
        apply_when_not_null: Callable[[Any], _R] | _T | None = None,
        col_name: str | None = None,
    ) -> Self:
        """
        Apply to DataFrame's row with missing value.

        Parameters
        ----------
        apply_when_null : Callable[[Any], R] | T | None, optional
            Callable (called with the row) or value used when
            any value in the row is null, by default ``None``

        apply_when_not_null : Callable[[Any], R] | T | None, optional
            Callable (called with the row) or value used when
            the entire row is not null, by default ``None``

        col_name : str | None, optional
            Output column name, by default ``None`` (uses ``"applied_row_null"``)

        Returns
        -------
        Self
            Modified DataFrame (modified in place).
            When both ``apply_when_null`` and ``apply_when_not_null`` are ``None``,
            the output column tells if the entire row is not null.


        Example:
        --------
        >>> df = DADF({"A": [None, 2, 3, 4], "B": [1, None, 3, 4], "C": [None, 2, None, 4]})
        >>> df.apply_notnull_row()
             A    B    C  applied_row_null
        0  NaN  1.0  NaN             False
        1  2.0  NaN  2.0             False
        2  3.0  3.0  NaN             False
        3  4.0  4.0  4.0              True
        >>> df.apply_notnull_row(0, 1)
             A    B    C  applied_row_null
        0  NaN  1.0  NaN                 0
        1  2.0  NaN  2.0                 0
        2  3.0  3.0  NaN                 0
        3  4.0  4.0  4.0                 1
        >>> df.apply_notnull_row(lambda _: "n", lambda _: "y", col_name="mod")
             A    B    C mod
        0  NaN  1.0  NaN   n
        1  2.0  NaN  2.0   n
        2  3.0  3.0  NaN   n
        3  4.0  4.0  4.0   y
        """
        # Column name
        cname = "applied_row_null" if col_name is None else col_name

        # Both None: only flag the rows without any null value
        if apply_when_null is None and apply_when_not_null is None:
            self[cname] = self.notnull().all(axis=1)
            return self

        def apply_func(row: pd.Series):
            # Pick the handler depending on whether the row has a null value
            if row.notnull().all():
                handler = apply_when_not_null
            else:
                handler = apply_when_null
            # A callable receives the row, anything else is used as is
            return handler(row) if callable(handler) else handler

        self[cname] = self.apply(apply_func, axis=1)
        return self
# Other # ---------------------------------------------------------------------------
class DataAnalystDataFrameOtherMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Other method/Stuff

    - Merge left
    """

    @versionadded("4.0.0")
    def merge_left(
        self,
        other: Self | pd.DataFrame,
        on: str,
        columns: list[str] | None = None,
    ) -> Self:
        """
        Merge left of 2 DataFrame

        Neither this DataFrame nor ``other`` is modified.

        Parameters
        ----------
        other : Self | pd.DataFrame
            DataFrame to merge

        on : str
            Merge on which column

        columns : list[str] | None, optional
            Columns to take from other DataFrame, by default ``None``
            (Take all columns)

        Returns
        -------
        Self
            Merged DataFrame

        Raises
        ------
        KeyError
            When a column in ``columns`` does not exist in ``other``


        Example:
        --------
        >>> df1 = DADF({
        ...     "id": [1, 2, 5],
        ...     "name": ["Alice", "Bob", "Rich"],
        ...     "age": [20, 20, 20],
        ... })
        >>> df2 = DADF({
        ...     "id": [1, 2, 3],
        ...     "age": [25, 30, 45],
        ...     "department": ["HR", "IT", "PM"],
        ...     "salary": [50000, 60000, 55000],
        ... })
        >>> df1.merge_left(df2, on="id")
           id   name  age_x  age_y department   salary
        0   1  Alice     20   25.0         HR  50000.0
        1   2    Bob     20   30.0         IT  60000.0
        2   5   Rich     20    NaN        NaN      NaN
        >>> df1.merge_left(df2, on="id", columns=["salary"])
           id   name  age   salary
        0   1  Alice   20  50000.0
        1   2    Bob   20  60000.0
        2   5   Rich   20      NaN
        """
        right = other
        if columns is not None:
            # Take only the key and the requested columns from ``other``
            # (``dict.fromkeys`` removes duplicates while keeping the order)
            wanted = list(dict.fromkeys([on, *columns]))
            right = other[wanted]

        out = self.merge(right, how="left", on=on)
        return self.__class__(out)
# Date # ---------------------------------------------------------------------------
class DataAnalystDataFrameDateMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Date

    - Add date column from month column
    - Add detail date
    - Delta date (How many days inbetween)
    """

    def add_date_from_month(self, month_column: str, *, col_name: str = "date") -> Self:
        """
        Add dummy ``date`` column from ``month`` column
        (first day of the month, in the current year)

        Parameters
        ----------
        month_column : str
            Month column

        col_name : str
            New date column name, by default: ``"date"``
            (overwritten in place if already exist)

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = (
        ...     DADF.sample_df(2)
        ...     .add_detail_date("date", mode="m")
        ...     .drop_columns(["date", "number", "number_range"])
        ... )
        >>> df
           number_big  missing_value      text  month
        0         755            NaN  lincgqzl      4
        1         907            NaN  gxltrjku     10
        >>> df.add_date_from_month("month")
           number_big  missing_value      text  month       date
        0         755            NaN  lincgqzl      4 2025-04-01
        1         907            NaN  gxltrjku     10 2025-10-01
        """
        is_new_column = col_name not in self.columns
        _this_year = datetime.now().year
        self[col_name] = pd.to_datetime(
            f"{_this_year}-" + self[month_column].astype(int).astype(str) + "-1",
            format="%Y-%m-%d",
        )

        # Rearrange: only a newly created column sits at the right-most
        # position; an overwritten column keeps its place and must not
        # cause another column to be moved.
        if is_new_column and getattr(self, "rearrange_rightmost_column", None) is not None:
            # Compatible with DataAnalystDataFrameColumnMethodMixin
            return self.rearrange_rightmost_column(month_column)  # type: ignore [no-any-return]
        return self

    def add_detail_date(self, date_column: str, mode: str = "dwmy") -> Self:
        """
        Add these columns from ``date_column``:
            - ``date`` (won't add if ``date_column`` value is ``"date"``)
            - ``day`` (overwrite if already exist)
            - ``week`` (overwrite if already exist)
            - ``month`` (overwrite if already exist)
            - ``year`` (overwrite if already exist)

        Newly created detail columns are moved next to ``date_column``;
        overwritten columns keep their position.

        Parameters
        ----------
        date_column : str
            Date column

        mode : str
            | Detailed column to add
            | ``d``: day
            | ``w``: week number
            | ``m``: month
            | ``y``: year
            | (Default: ``"dwmy"``)

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big number_range  missing_value      text       date
        0  0.331195         902          900             20  fgyanxik 2021-10-18
        1 -0.877727         378          300             13  dqvaggjo 2007-03-06
        >>> df.add_detail_date("date")
             number  number_big number_range  missing_value      text       date  day  week  month  year
        0  0.331195         902          900             20  fgyanxik 2021-10-18   18    42     10  2021
        1 -0.877727         378          300             13  dqvaggjo 2007-03-06    6    10      3  2007
        """
        # Convert to datetime
        self["date"] = pd.to_datetime(self[date_column])
        dates = self["date"]

        # Columns present before any detail column is written
        existing = set(self.columns)

        # (mode flag, output column, extractor)
        details = (
            ("d", "day", lambda s: s.dt.day),
            ("w", "week", lambda s: s.dt.isocalendar().week),
            ("m", "month", lambda s: s.dt.month),
            ("y", "year", lambda s: s.dt.year),
        )

        # Logic
        new_col_counter = 0  # Only newly created columns end up right-most
        for flag, name, extract in details:
            if flag not in mode:
                continue
            logger.debug(f"Mode: '{name}'")
            if name not in existing:
                new_col_counter += 1
            self[name] = extract(dates)

        # Return
        if new_col_counter > 0 and getattr(self, "rearrange_rightmost_column", None) is not None:
            # Compatible with DataAnalystDataFrameColumnMethodMixin
            return self.rearrange_rightmost_column(date_column, new_col_counter)  # type: ignore [no-any-return]
        return self

    def delta_date(
        self,
        date_column: str,
        mode: Literal["now", "between_row"] = "now",
        *,
        col_name: str = "delta_date",
    ) -> Self:
        """
        Calculate date interval

        Parameters
        ----------
        date_column : str
            Date column

        mode : str
            | Mode to calculate
            | ``"between_row"``: Calculate date interval between each row
            |   (the first row is compared with itself)
            | ``"now"``: Calculate date interval to current date
            | (Default: ``"now"``)

        col_name : str
            | New delta date column name
            | (Default: ``"delta_date"``)

        Returns
        -------
        Self
            Modified DataFrame (modified in place), interval is in days


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big number_range  missing_value      text       date
        0 -0.729988         435          400             21  xkrqqouf 2014-08-01
        1 -0.846031         210          200              5  rbkmiqxt 2024-07-10
        >>> df.delta_date("date")
             number  number_big number_range  missing_value      text       date  delta_date
        0 -0.729988         435          400             21  xkrqqouf 2014-08-01        3873
        1 -0.846031         210          200              5  rbkmiqxt 2024-07-10         242
        """
        if mode.lower().startswith("between_row"):
            dated = self[date_column].to_list()
            # Pair each date with the previous one; the first date is paired
            # with itself
            previous = dated[:1] + dated[:-1]
            self[col_name] = [
                (current - before).days for current, before in zip(dated, previous)
            ]
        else:  # mode="now"
            # Single reference time so that every row uses the same "now"
            now = datetime.now()
            self[col_name] = self[date_column].apply(lambda x: (now - x).days)
        return self
# City # ---------------------------------------------------------------------------
class DataAnalystDataFrameCityMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - City

    - Convert city
    """

    def convert_city(
        self,
        city_column: str,
        city_list: list[CityData],
        *,
        mode: str = "ra",
    ) -> Self:
        """
        Get ``region`` and ``area`` of a city

        A value matches the first city in ``city_list`` whose name starts
        with that value (case-insensitive). Empty or non-text values
        (such as N/A) never match and get ``np.nan``.

        Parameters
        ----------
        city_column : str
            Column contains city data

        city_list : list[CityData]
            List of city in correct format

        mode : str
            | Detailed column to add
            | ``r``: region
            | ``a``: area
            | (Default: ``"ra"``)

        Returns
        -------
        Self
            Modified DataFrame
        """

        # Support function
        def _convert_city_support(value: Any) -> CityData:
            # ``str.startswith("")`` is always True, so an empty value
            # must be rejected before the search
            if isinstance(value, str) and value:
                needle = value.lower()
                for x in city_list:
                    if x.city.lower().startswith(needle):
                        return x
            return CityData(city=value, region=np.nan, area=np.nan)  # type: ignore

        # Look up each row once, the result is shared by both modes
        matches = [_convert_city_support(value) for value in self[city_column]]
        existing = set(self.columns)

        # Convert
        new_col_counter = 0  # Only newly created columns end up right-most
        if "r" in mode:
            logger.debug("Mode: 'region'")
            if "region" not in existing:
                new_col_counter += 1
            self["region"] = [city.region for city in matches]
        if "a" in mode:
            logger.debug("Mode: 'area'")
            if "area" not in existing:
                new_col_counter += 1
            self["area"] = [city.area for city in matches]

        # Rearrange
        if new_col_counter > 0 and getattr(self, "rearrange_rightmost_column", None) is not None:
            # Compatible with DataAnalystDataFrameColumnMethodMixin
            return self.rearrange_rightmost_column(city_column, new_col_counter)  # type: ignore [no-any-return]
        return self
# Main # ---------------------------------------------------------------------------
class DADF(
    ShowAllMethodsMixin,
    DataAnalystDataFrameCityMixin,
    DataAnalystDataFrameDateMixin,
    DataAnalystDataFrameOtherMixin,
    DataAnalystDataFrameNAMixin,
    DataAnalystDataFrameInfoMixin,
    DataAnalystDataFrameRowMethodMixin,
    DataAnalystDataFrameColumnMethodMixin,
):
    """
    Data Analyst ``pd.DataFrame``

    For a list of extra methods:
    >>> print(DADF.DADF_METHODS)
    """

    @classmethod
    @deprecated("5.1.0")
    @versionadded("3.2.0")
    def dadf_help(cls) -> list[str]:
        """
        Show all available method of DataAnalystDataFrame

        Returns
        -------
        list[str]
            Sorted names that exist on this class but not on ``pd.DataFrame``
        """
        list_of_method = list(set(dir(cls)) - set(dir(pd.DataFrame)))
        return sorted(list_of_method)

    @classmethod
    def sample_df(cls, size: int = 100) -> Self:
        """
        Create sample DataFrame

        Parameters
        ----------
        size : int
            Number of observations, by default ``100``
            (values smaller than ``1`` are treated as ``1``)

        Returns
        -------
        Self
            DataFrame with these columns:
            [number, number_big, number_range, missing_value, text, date]


        Example:
        --------
        >>> DADF.sample_df()
              number  number_big number_range  missing_value      text       date
        0  -2.089770         785          700            NaN  vwnlqoql 2013-11-20
        1  -0.526689         182          100           24.0  prjjcvqc 2007-04-13
        2  -1.596514         909          900            8.0  cbcpzlac 2023-05-24
        ..       ...         ...          ...            ...       ...        ...
        98  0.327702         845          800            NaN  leadkvsy 2005-08-05
        99  0.556720         981          900           36.0  bozmxixy 2004-02-22
        [100 rows x 6 columns]
        """
        # Restrain
        size = max(size, 1)

        # Number col
        df = cls(np.random.randn(size, 1), columns=["number"])
        # Big number in range 100-999 (both ends included)
        df["number_big"] = [random.randint(100, 999) for _ in range(size)]
        # First digit followed by "00", kept as text
        df["number_range"] = df["number_big"].apply(lambda x: str(x)[0] + "00")

        # Missing value col: values not below the random ``na_rate`` become N/A
        na_rate = random.randint(1, 99)
        draws = [random.randint(1, 99) for _ in range(size)]
        df["missing_value"] = [x if x < na_rate else np.nan for x in draws]

        # Text col: 8 random lowercase letters
        df["text"] = [
            "".join(random.choices(string.ascii_lowercase, k=8)) for _ in range(size)
        ]

        # Random date col: within the last 20 years, day limited to 28 so
        # that the date is valid in every month
        this_year = datetime.now().year
        df["date"] = [
            datetime(
                year=random.randint(this_year - 20, this_year),
                month=random.randint(1, 12),
                day=random.randint(1, 28),
            )
            for _ in range(size)
        ]

        # Return
        return df
class DADF_WIP(DADF):
    """
    W.I.P - No test cases written

    Subclass of ``DADF`` that currently adds nothing of its own.
    """