Source code for dffml.source.dataframe

"""
Expose Pandas DataFrame as DFFML Source
"""
from typing import AsyncIterator, List


from ..record import Record
from ..base import config, field
from ..util.entrypoint import entrypoint
from ..util.net import DEFAULT_PROTOCOL_ALLOWLIST
from .source import BaseSourceContext, BaseSource


class DataFrameSourceContext(BaseSourceContext):
    """
    Source context which reads and writes records through the pandas
    DataFrame stored in the parent source's config.

    Columns listed in the parent config's ``predictions`` are exposed as
    record predictions, every other column is exposed as a feature.
    """

    async def update(self, record: Record):
        """
        Write the features and predictions of ``record`` into the DataFrame.

        Feature values are only written for columns which already exist in
        the DataFrame. Prediction values are written for each configured
        prediction column the record has a prediction for. The row is
        addressed by label (``df.loc``) using ``record.key``.
        """
        # Shorthand for DataFrame
        df = self.parent.config.dataframe
        # Store feature data
        features = record.features()
        for col in df.columns:
            if col in features:
                df.loc[record.key, col] = features[col]
        # Store prediction
        predictions = record.predictions()
        for col in self.parent.config.predictions:
            if col in predictions:
                df.loc[record.key, col] = predictions[col]["value"]

    async def records(self) -> AsyncIterator[Record]:
        """
        Yield one Record per DataFrame row, keyed by ``str()`` of the row's
        index label.
        """
        prediction_columns = self.parent.config.predictions
        # NOTE itertuples() renames columns which are not valid Python
        # identifiers to positional names, such columns will show up under
        # the renamed name.
        for row in self.parent.config.dataframe.itertuples():
            features = dict(row._asdict())
            # The index is used as the record key, it is not a feature
            del features["Index"]
            # Move prediction columns out of the features. Columns which are
            # configured but not (yet) present in the DataFrame are skipped
            # rather than raising KeyError, they only come into existence
            # once update() has stored a prediction.
            predictions = {
                col: {"value": features.pop(col)}
                for col in prediction_columns
                if col in features
            }
            yield Record(
                str(row.Index),
                data={"features": features, "prediction": predictions},
            )

    async def record(self, key: str) -> Record:
        """
        Return the Record for the row at integer position ``int(key)``.

        Raises whatever ``int()`` / ``DataFrame.iloc`` raise when ``key`` is
        not an integer or is out of bounds.
        """
        # NOTE(review): lookup is positional (iloc) while records() keys by
        # index label and update() writes by label. These agree for the
        # default RangeIndex. Confirm intent for other index types.
        data = self.parent.config.dataframe.iloc[int(key)]
        # Same prediction layout as records(): {column: {"value": ...}},
        # which is also what update() reads back.
        predictions = {
            col: {"value": data[col]}
            for col in self.parent.config.predictions
            if col in data.index
        }
        # Series.items() yields (column, value) pairs
        features = {
            col: value
            for col, value in data.items()
            if col not in predictions
        }
        return Record(
            str(key), data={"features": features, "prediction": predictions},
        )


@config
class DataFrameSourceConfig:
    # DataFrame the source proxies. When left as None the source creates one,
    # either from ``html`` or as an empty DataFrame.
    dataframe: "pandas.DataFrame" = field(
        "The pandas DataFrame to proxy", default=None
    )
    # Columns treated as predictions rather than features
    predictions: List[str] = field(
        "Prediction columns whose values we have to update",
        default_factory=lambda: [],
    )
    # TODO Get rid of this basic approach when we implement #1168
    html: str = field(
        "Construct a DataFrame using DataFrame.read_html(). Passing this as URL",
        default=None,
    )
    # Which of the tables parsed from ``html`` becomes the DataFrame
    html_table_index: int = field(
        "If there are multiple html tables on a page, which one? Array indexed"
        ", so first table means 0, if you want the second table on the page"
        ", use 1 here.",
        default=0,
    )
    # Each config gets its own copy of the default allowlist so that mutating
    # one config's list can never alter the module level default shared by
    # every other config.
    protocol_allowlist: List[str] = field(
        'List of protocols allowed for ``html`` URL. Example ``["http://"]``',
        default_factory=lambda: list(DEFAULT_PROTOCOL_ALLOWLIST),
    )


class PandasNotInstalled(ImportError):
    """
    Raised when a DataFrame has to be created but pandas cannot be imported.
    """


class DataFrameHTMLTableIndexNotFoundError(IndexError):
    """
    Raised when ``html_table_index`` is beyond the number of tables found in
    the HTML document.
    """


class DataFrameHTMLProtocolNotAllowedError(ValueError):
    """
    Raised when the ``html`` URL does not start with one of the protocols in
    ``protocol_allowlist``.
    """


@entrypoint("dataframe")
class DataFrameSource(BaseSource):
    r"""
    Proxy for a pandas DataFrame

    Examples
    --------

    You can pass a pandas DataFrame to this class directly via the Python API.
    Or you can create DataFrames from other data sources via the Python API or
    the command line.

    **Example of creating a DataFrame from HTML via command line.**

    Create an HTML table.

    **index.html**

    .. code-block:: html
        :test:
        :filepath: index.html

        <table>
          <tr>
            <th>Years</th>
            <th>Salary</th>
          </tr>
          <tr>
            <td>0</td>
            <td>10</td>
          </tr>
          <tr>
            <td>1</td>
            <td>20</td>
          </tr>
          <tr>
            <td>2</td>
            <td>30</td>
          </tr>
        </table>

    Start the HTTP server to serve the HTML page with the table

    .. code-block:: console
        :test:
        :daemon: 8000

        $ python -m http.server 8000

    In another terminal. List all the records in the source.

    .. code-block:: console
        :test:
        :replace: cmds[0][-3] = cmds[0][-3].replace("8000", str(ctx["HTTP_SERVER"]["8000"]))

        $ dffml list records \
            -sources table=dataframe \
            -source-table-html http://127.0.0.1:8000/index.html \
            -source-table-protocol_allowlist http://

        [
            {
                "extra": {},
                "features": {
                    "Salary": 10,
                    "Years": 0
                },
                "key": "0"
            },
            {
                "extra": {},
                "features": {
                    "Salary": 20,
                    "Years": 1
                },
                "key": "1"
            },
            {
                "extra": {},
                "features": {
                    "Salary": 30,
                    "Years": 2
                },
                "key": "2"
            }
        ]

    """

    CONFIG = DataFrameSourceConfig
    CONTEXT = DataFrameSourceContext

    def __init__(self, config):
        """
        Create the DataFrame when the config does not already provide one.

        With ``html`` set, the table at ``html_table_index`` of the document
        at that URL becomes the DataFrame. Otherwise an empty DataFrame is
        created.

        Raises
        ------
        PandasNotInstalled
            A DataFrame has to be created and pandas cannot be imported.
        DataFrameHTMLProtocolNotAllowedError
            ``html`` does not start with a protocol in ``protocol_allowlist``.
        DataFrameHTMLTableIndexNotFoundError
            ``html_table_index`` is not less than the number of tables found.
        """
        super().__init__(config)
        # Nothing to create if we were handed a DataFrame
        if self.config.dataframe is not None:
            return
        try:
            # pandas is only required when we have to create the DataFrame
            import pandas
        except ImportError as error:
            # ModuleNotFoundError is a subclass of ImportError, so this
            # covers both. Say that pandas must be installed to create new
            # DataFrames
            raise PandasNotInstalled(
                "Pandas is required to create new DataFrames. $ pip install pandas"
            ) from error
        if self.config.html is None:
            # Create empty DataFrame
            self.config.dataframe = pandas.DataFrame()
            return
        # TODO Modify this in line with changes for #1168
        # Enforce the allowlist before pandas fetches anything. URL schemes
        # are case insensitive, so compare lowercased.
        url = str(self.config.html)
        if not any(
            url.lower().startswith(str(protocol).lower())
            for protocol in self.config.protocol_allowlist
        ):
            raise DataFrameHTMLProtocolNotAllowedError(
                f"Protocol of {url!r} is not in allowlist:"
                f" {self.config.protocol_allowlist!r}"
            )
        dataframes = pandas.read_html(url)
        if self.config.html_table_index >= len(dataframes):
            raise DataFrameHTMLTableIndexNotFoundError(
                f"Index {self.config.html_table_index} requested"
                f" {len(dataframes)} table(s) found."
            )
        self.config.dataframe = dataframes[self.config.html_table_index]