Skip to content

Pandas

The Pandas plugin provides Schema and Field models and converters between the Pandas and Data Package notations.

Installation

An extra dependency needs to be installed:

pip install dplib-py[pandas]

Usage

Converting a Pandas dataframe to the Data Package notation:

from dplib.plugins.pandas.models import PandasSchema

# `df` is assumed to be an existing pandas DataFrame defined by the caller
schema = PandasSchema(df=df).to_dp()
print(schema.to_text(format='json'))

Converting from Data Package notation to Pandas:

from dplib.models import Schema
from dplib.plugins.pandas.models import PandasSchema

# `schema.df` is the dataframe that `from_dp` builds from the Table Schema fields
schema = PandasSchema.from_dp(Schema.from_path('data/schema.json'))
print(schema.df)

Reference

dplib.plugins.pandas.models.PandasSchema

Bases: Model

Pandas Schema model

Source code in dplib/plugins/pandas/models/schema.py
class PandasSchema(Model, arbitrary_types_allowed=True):
    """Pandas Schema model

    Wraps a dataframe and converts its structure (index levels and column
    dtypes) to and from a Table Schema.
    """

    # Dataframe whose index levels and columns describe the schema
    df: pd.DataFrame

    # Getters

    def get_field_names(self) -> List[str]:
        """Get field names

        Returns:
            Labels of the dataframe columns
        """
        return list(self.df.columns)

    def get_field_types(self) -> List[Any]:
        """Get field types

        Returns:
            Dtypes of the dataframe columns, in column order
        """
        return list(self.df.dtypes)  # type: ignore

    # Converters

    def to_dp(self) -> Schema:
        """Convert to Table Schema

        Named index levels become required primary key fields, followed by
        one field per dataframe column. Unnamed index levels (for example
        the default index) are skipped because a field requires a name.

        Returns:
            Table Schema
        """
        schema = Schema()

        # Primary key
        for index, name in enumerate(self.df.index.names):  # type: ignore
            # An unnamed level has no name to build a field from
            if name is None:
                continue
            dtype = self.df.index.get_level_values(index).dtype  # type: ignore
            # Level names are stringified the same way as column names below
            field = PandasField(name=str(name), dtype=dtype).to_dp()
            field.constraints.required = True
            schema.fields.append(field)
            schema.primaryKey.append(str(name))

        # Fields
        for name, dtype in self.df.dtypes.items():  # type: ignore
            # The first row value (if any) helps to detect the type of object columns
            dvalue: Any = self.df[name].iloc[0] if len(self.df) else None  # type: ignore
            field = PandasField(name=str(name), dtype=dtype, dvalue=dvalue).to_dp()
            schema.fields.append(field)

        return schema

    @classmethod
    def from_dp(cls, schema: Schema) -> PandasSchema:
        """Create Pandas Schema from Table Schema

        Builds an empty dataframe with one typed column per schema field and
        moves the primary key fields into the dataframe index.

        Parameters:
            schema: Table Schema

        Returns:
            Pandas Schema
        """
        columns: Dict[str, pd.Series[Any]] = {}

        # Fields
        for field in schema.fields:
            pandas_field = PandasField.from_dp(field)
            columns[pandas_field.name] = pd.Series(dtype=pandas_field.dtype)  # type: ignore

        df = pd.DataFrame(columns)

        # Primary key
        # The key holds field names, not row labels: passing it as `index=`
        # would add one all-NaN row per key name and lose the column dtypes,
        # so the matching columns are moved into the index instead. Key names
        # without a matching field are ignored (there is no column to move).
        index = [name for name in (schema.primaryKey or []) if name in columns]
        if index:
            df = df.set_index(index)

        return cls(df=df)

df: pd.DataFrame instance-attribute

from_dp(schema) classmethod

Create Pandas Schema from Table Schema

Parameters:

Name Type Description Default
schema Schema

Table Schema

required

Returns:

Type Description
PandasSchema

Pandas Schema

Source code in dplib/plugins/pandas/models/schema.py
@classmethod
def from_dp(cls, schema: Schema) -> PandasSchema:
    """Create Pandas Schema from Table Schema

    Builds an empty dataframe with one typed column per schema field and
    moves the primary key fields into the dataframe index.

    Parameters:
        schema: Table Schema

    Returns:
        Pandas Schema
    """
    columns: Dict[str, pd.Series[Any]] = {}

    # Fields
    for field in schema.fields:
        pandas_field = PandasField.from_dp(field)
        columns[pandas_field.name] = pd.Series(dtype=pandas_field.dtype)  # type: ignore

    df = pd.DataFrame(columns)

    # Primary key
    # The key holds field names, not row labels: passing it as `index=`
    # would add one all-NaN row per key name and lose the column dtypes,
    # so the matching columns are moved into the index instead. Key names
    # without a matching field are ignored (there is no column to move).
    index = [name for name in (schema.primaryKey or []) if name in columns]
    if index:
        df = df.set_index(index)

    return cls(df=df)

get_field_names()

Get field names

Source code in dplib/plugins/pandas/models/schema.py
def get_field_names(self) -> List[str]:
    """Return the dataframe column labels as a plain list"""
    column_labels = self.df.columns
    return [*column_labels]

get_field_types()

Get field types

Source code in dplib/plugins/pandas/models/schema.py
def get_field_types(self) -> List[Any]:
    """Return the dataframe column dtypes as a plain list, in column order"""
    column_dtypes = self.df.dtypes
    return [*column_dtypes]  # type: ignore

to_dp()

Convert to Table Schema

Returns:

Type Description
Schema

Table Schema

Source code in dplib/plugins/pandas/models/schema.py
def to_dp(self) -> Schema:
    """Convert to Table Schema

    Named index levels become required primary key fields, followed by
    one field per dataframe column. Unnamed index levels (for example
    the default index) are skipped because a field requires a name.

    Returns:
        Table Schema
    """
    schema = Schema()

    # Primary key
    for index, name in enumerate(self.df.index.names):  # type: ignore
        # An unnamed level has no name to build a field from
        if name is None:
            continue
        dtype = self.df.index.get_level_values(index).dtype  # type: ignore
        # Level names are stringified the same way as column names below
        field = PandasField(name=str(name), dtype=dtype).to_dp()
        field.constraints.required = True
        schema.fields.append(field)
        schema.primaryKey.append(str(name))

    # Fields
    for name, dtype in self.df.dtypes.items():  # type: ignore
        # The first row value (if any) helps to detect the type of object columns
        dvalue: Any = self.df[name].iloc[0] if len(self.df) else None  # type: ignore
        field = PandasField(name=str(name), dtype=dtype, dvalue=dvalue).to_dp()
        schema.fields.append(field)

    return schema

dplib.plugins.pandas.models.PandasField

Bases: Model

Pandas Field model

Source code in dplib/plugins/pandas/models/field.py
class PandasField(Model, arbitrary_types_allowed=True):
    """Pandas Field model"""

    # Field name
    name: str
    # Dtype of the column or index level the field describes
    dtype: Any
    # Optional sample value, consulted only when the dtype alone is not decisive
    dvalue: Optional[Any] = None

    # Converters

    def to_dp(self) -> models.IField:
        """Convert to Table Schema Field

        The dtype is checked first; if none of the dtype rules apply and a
        sample value is available, the Python type of that value decides.
        Otherwise the generic field class is used.

        Returns:
            Table Schema Field
        """

        # Type (rules are ordered, the first match wins)
        dtype_rules = (
            (pdc.is_bool_dtype, models.BooleanField),
            (pdc.is_datetime64_any_dtype, models.DatetimeField),
            (pdc.is_integer_dtype, models.IntegerField),
            (pdc.is_numeric_dtype, models.NumberField),
        )
        field_class = models.Field
        for matches, candidate in dtype_rules:
            if matches(self.dtype):  # type: ignore
                field_class = candidate
                break
        else:
            if self.dvalue is not None:
                # datetime is listed before date because a datetime is also a date
                value_rules = (
                    ((list, tuple), models.ArrayField),
                    (datetime.datetime, models.DatetimeField),
                    (datetime.date, models.DateField),
                    (isodate.Duration, models.DurationField),
                    (dict, models.ObjectField),
                    (str, models.StringField),
                    (datetime.time, models.TimeField),
                )
                for python_types, candidate in value_rules:
                    if isinstance(self.dvalue, python_types):  # type: ignore
                        field_class = candidate
                        break

        # Name
        return field_class(name=self.name)

    @classmethod
    def from_dp(cls, field: models.IField) -> PandasField:
        """Create Pandas Field from Table Schema Field

        Parameters:
            field: Table Schema Field

        Returns:
            Pandas Field

        Raises:
            Error: if the field has no name
        """
        if not field.name:
            raise Error(f"Field name is required to convert to pandas: {field}")

        # Type (anything not listed here falls back to the object dtype)
        python_types = {
            "array": list,
            "boolean": bool,
            "integer": int,
            "geojson": dict,
            "number": float,
            "object": dict,
            "string": str,
            "year": int,
        }
        if field.type == "datetime":
            dtype = pd.DatetimeTZDtype(tz="UTC")
        elif field.type in python_types:
            dtype = np.dtype(python_types[field.type])  # type: ignore
        else:
            dtype = np.dtype("O")

        return PandasField(name=field.name, dtype=dtype)

dtype: Any instance-attribute

dvalue: Optional[Any] = None class-attribute instance-attribute

name: str instance-attribute

from_dp(field) classmethod

Create Pandas Field from Table Schema Field

Parameters:

Name Type Description Default
field IField

Table Schema Field

required

Returns:

Type Description
PandasField

Pandas Field

Source code in dplib/plugins/pandas/models/field.py
@classmethod
def from_dp(cls, field: models.IField) -> PandasField:
    """Create Pandas Field from Table Schema Field

    Parameters:
        field: Table Schema Field

    Returns:
        Pandas Field

    Raises:
        Error: if the field has no name
    """
    if not field.name:
        raise Error(f"Field name is required to convert to pandas: {field}")

    # Type (anything not listed here falls back to the object dtype)
    python_types = {
        "array": list,
        "boolean": bool,
        "integer": int,
        "geojson": dict,
        "number": float,
        "object": dict,
        "string": str,
        "year": int,
    }
    if field.type == "datetime":
        dtype = pd.DatetimeTZDtype(tz="UTC")
    elif field.type in python_types:
        dtype = np.dtype(python_types[field.type])  # type: ignore
    else:
        dtype = np.dtype("O")

    return PandasField(name=field.name, dtype=dtype)

to_dp()

Convert to Table Schema Field

Returns:

Type Description
IField

Table Schema Field

Source code in dplib/plugins/pandas/models/field.py
def to_dp(self) -> models.IField:
    """Convert to Table Schema Field

    The dtype is checked first; if none of the dtype rules apply and a
    sample value is available, the Python type of that value decides.
    Otherwise the generic field class is used.

    Returns:
        Table Schema Field
    """

    # Type (rules are ordered, the first match wins)
    dtype_rules = (
        (pdc.is_bool_dtype, models.BooleanField),
        (pdc.is_datetime64_any_dtype, models.DatetimeField),
        (pdc.is_integer_dtype, models.IntegerField),
        (pdc.is_numeric_dtype, models.NumberField),
    )
    field_class = models.Field
    for matches, candidate in dtype_rules:
        if matches(self.dtype):  # type: ignore
            field_class = candidate
            break
    else:
        if self.dvalue is not None:
            # datetime is listed before date because a datetime is also a date
            value_rules = (
                ((list, tuple), models.ArrayField),
                (datetime.datetime, models.DatetimeField),
                (datetime.date, models.DateField),
                (isodate.Duration, models.DurationField),
                (dict, models.ObjectField),
                (str, models.StringField),
                (datetime.time, models.TimeField),
            )
            for python_types, candidate in value_rules:
                if isinstance(self.dvalue, python_types):  # type: ignore
                    field_class = candidate
                    break

    # Name
    return field_class(name=self.name)