"""
Absfuyu: Data Analysis
----------------------
Data Analyst DataFrame - Base/Core
Version: 5.1.0
Date updated: 10/03/2025 (dd/mm/yyyy)
"""
# Module level
# ---------------------------------------------------------------------------
__all__ = ["DataAnalystDataFrameBase", "SplittedDF", "CityData"]
# Library
# ---------------------------------------------------------------------------
import random
from collections import deque
from typing import ClassVar, Literal, NamedTuple
import pandas as pd
# Class
# ---------------------------------------------------------------------------
class DataAnalystDataFrameBase(pd.DataFrame):
    """
    Data Analyst ``pd.DataFrame`` - Base

    Each subclass that defines at least one callable attribute in its own
    class body is recorded in ``DADF_METHODS`` when the subclass is created.

    Set class variable ``_DADF_INCLUDE`` to ``False`` to exclude from ``DADF_METHODS``
    """

    # Custom attribute
    _DADF_INCLUDE: ClassVar[bool] = True  # Include in DADF_METHODS
    # Shared registry: "<module>.<class name>" -> sorted callable names
    DADF_METHODS: ClassVar[dict[str, list[str]]] = {}

    def __init_subclass__(cls, *args, **kwargs) -> None:
        """
        Register the new subclass in ``DADF_METHODS``.

        This creates a dictionary entry with:

        - key (str)        : ``"<module>.<class name>"`` of the subclass
        - value (list[str]): Sorted names of the callables defined
          directly in the subclass body

        Nothing is registered when ``_DADF_INCLUDE`` is false, when a class
        with the same name is already registered, or when the subclass
        defines no callable of its own.
        """
        super().__init_subclass__(*args, **kwargs)

        if not cls._DADF_INCLUDE:
            return

        # Compare the exact class-name component of each key. A plain
        # suffix test would wrongly treat "Name" as already registered
        # once "LongerName" is in the registry.
        registered_names = {key.rsplit(".", 1)[-1] for key in cls.DADF_METHODS}
        if cls.__name__ in registered_names:
            return

        # Only look at the subclass' own namespace, not inherited members
        methods_list: list[str] = sorted(
            name for name, value in cls.__dict__.items() if callable(value)
        )
        if methods_list:
            cls.DADF_METHODS[f"{cls.__module__}.{cls.__name__}"] = methods_list
class SplittedDF(NamedTuple):
    """
    A DataFrame split into two parts: the rows without
    missing values and the rows with missing values

    Parameters
    ----------
    df : DataFrame
        DataFrame without missing values

    df_na : DataFrame
        DataFrame with missing values only
    """

    df: pd.DataFrame
    df_na: pd.DataFrame

    @staticmethod
    def concat_df(
        df_list: list[pd.DataFrame], join: Literal["inner", "outer"] = "inner"
    ) -> pd.DataFrame:
        """
        Concat the list of DataFrame (static method)

        The frames are stacked row-wise and the result
        gets a fresh ``0..n-1`` index.

        Parameters
        ----------
        df_list : list[DataFrame]
            A sequence of DataFrame

        join : str
            Join type
            (Default: ``"inner"``)

        Returns
        -------
        DataFrame
            Joined DataFrame
        """
        stacked: pd.DataFrame = pd.concat(df_list, axis=0, join=join)
        # drop=True discards the old index directly. Materialising it as a
        # column and dropping "index" afterwards would delete a real data
        # column called "index" and fail on a named index.
        return stacked.reset_index(drop=True)

    def concat(self, join: Literal["inner", "outer"] = "inner") -> pd.DataFrame:
        """
        Concat the splitted DataFrame

        Parameters
        ----------
        join : str
            Join type
            (Default: ``"inner"``)

        Returns
        -------
        DataFrame
            Joined DataFrame
        """
        # The tuple's fields (df, df_na) are the frames to join
        return self.concat_df(list(self), join=join)

    @staticmethod
    def divide_dataframe(df: pd.DataFrame, by_column: str) -> list[pd.DataFrame]:
        """
        Divide DataFrame into a list of DataFrame

        Parameters
        ----------
        df : DataFrame
            DataFrame

        by_column : str
            By which column

        Returns
        -------
        list[DataFrame]
            Splitted DataFrame, one per group
            produced by ``df.groupby(by_column)``
        """
        return [group for _, group in df.groupby(by_column)]
class CityData(NamedTuple):
    """
    City record with its region and area

    Parameters
    ----------
    city : str
        City name

    region : str
        Region of the city

    area : str
        Area of the region
    """

    city: str
    region: str
    area: str

    @staticmethod
    def _sample_city_data(size: int = 100) -> list["CityData"]:
        """
        Generate sample city data (testing purpose)

        Each city gets a unique zero-padded random serial. Region serials
        are recycled from the first half of the city serials and area
        serials from the first quarter, so several cities share a region
        and several regions share an area.

        Parameters
        ----------
        size : int
            Number of records to generate
            (Default: ``100``)

        Returns
        -------
        list[CityData]
            Exactly ``size`` records

        Raises
        ------
        ValueError
            When ``size`` is negative
        """
        if size < 0:
            raise ValueError(f"size must be non-negative, got {size}")

        width = len(str(size))
        sample_range = 10**width  # Always larger than ``size``

        # Unique serials drawn without replacement
        serials: list[str] = [
            str(number).rjust(width, "0")
            for number in random.sample(range(sample_range), size)
        ]

        # Cut half for region. Keep at least one serial so that small
        # sizes still receive a region.
        region_pool = deque(serials[: max(1, size // 2)])
        region_pool.rotate(random.randrange(1, 5))
        regions = list(region_pool)

        # Cut a fourth for area, same minimum of one serial
        area_pool = deque(serials[: max(1, size // 4)])
        area_pool.rotate(random.randrange(1, 5))
        areas = list(area_pool)

        # Recycle the shorter pools by position. With size == 0 the
        # loop body never runs, so the empty pools are never indexed.
        return [
            CityData(
                city="city_" + serial,
                region="region_" + regions[position % len(regions)],
                area="area_" + areas[position % len(areas)],
            )
            for position, serial in enumerate(serials)
        ]