Source code for dffml.source.dataframe

"""
Expose Pandas DataFrame as DFFML Source
"""
from typing import AsyncIterator, List


from ..record import Record
from ..base import config, field
from ..util.entrypoint import entrypoint
from ..util.net import DEFAULT_PROTOCOL_ALLOWLIST
from .source import BaseSourceContext, BaseSource


class DataFrameSourceContext(BaseSourceContext):
    """
    Source context which reads and writes records through the pandas
    DataFrame stored in the parent source's config.
    """

    async def update(self, record: Record):
        """
        Write the features and predictions of ``record`` into the DataFrame.

        Feature values are stored only for columns which already exist in the
        DataFrame. Prediction values are stored only for columns listed in the
        ``predictions`` config property. The row is addressed by label
        (``df.loc``) using ``record.key``.
        """
        # Shorthand for DataFrame
        df = self.parent.config.dataframe
        # Store feature data
        features = record.features()
        for col in df.columns:
            if col in features:
                df.loc[record.key, col] = features[col]
        # Store prediction
        predictions = record.predictions()
        for col in self.parent.config.predictions:
            if col in predictions:
                df.loc[record.key, col] = predictions[col]["value"]

    async def records(self) -> AsyncIterator[Record]:
        """
        Yield one Record per row of the DataFrame.

        The record key is the string form of the row's index label. Columns
        listed in the ``predictions`` config property are reported as
        predictions (``{"value": ...}``), all remaining columns as features.
        """
        # NOTE(review): itertuples() renames columns which are not valid
        # Python identifiers to positional names, so such columns would not be
        # reported under their real name. Confirm this is acceptable.
        for row in self.parent.config.dataframe.itertuples():
            features = dict(row._asdict())
            predictions = {
                col: {"value": features[col]}
                for col in self.parent.config.predictions
            }
            # The index label becomes the record key, it is not a feature
            del features["Index"]
            # A prediction column must not be reported as a feature as well
            for col in predictions:
                features.pop(col, None)
            yield Record(
                str(row.Index),
                data={"features": features, "prediction": predictions},
            )

    async def record(self, key: str) -> Record:
        """
        Return the Record for the row at integer position ``int(key)``.

        Columns listed in the ``predictions`` config property are reported as
        predictions using the same ``{"value": ...}`` layout as ``records()``,
        all remaining columns as features.
        """
        # NOTE(review): iloc is positional while update() addresses rows by
        # label via df.loc. The two only agree for a default integer index.
        # Confirm which addressing is intended.
        row = self.parent.config.dataframe.iloc[int(key)]
        # Same prediction layout as records(), this is also what update()
        # expects to read back (predictions[col]["value"])
        predictions = {
            col: {"value": row[col]} for col in self.parent.config.predictions
        }
        # Series.items() yields (label, value) pairs, both must be unpacked
        features = {
            column: value
            for column, value in row.items()
            if column not in predictions
        }
        return Record(
            str(key), data={"features": features, "prediction": predictions},
        )
@config
class DataFrameSourceConfig:
    """
    Configuration for DataFrameSource.

    Either an existing DataFrame is given via ``dataframe``, or ``html`` is
    given so that the source can build the DataFrame from an HTML table.
    """

    dataframe: "pandas.DataFrame" = field(
        "The pandas DataFrame to proxy", default=None
    )
    predictions: List[str] = field(
        "Prediction columns whose values we have to update",
        default_factory=list,
    )
    # TODO Get rid of this basic approach when we implement #1168
    html: str = field(
        "Construct a DataFrame using DataFrame.read_html(). Passing this as URL",
        default=None,
    )
    html_table_index: int = field(
        "If there are multiple html tables on a page, which one? Array indexed"
        ", so first table means 0, if you want the second table on the page"
        ", use 1 here.",
        default=0,
    )
    # Hand out a copy so that modifying the allowlist of one config instance
    # can never alter the module level default shared by everyone else
    protocol_allowlist: List[str] = field(
        'List of protocols allowed for ``html`` URL. Example ``["http://"]``',
        default_factory=lambda: list(DEFAULT_PROTOCOL_ALLOWLIST),
    )
@entrypoint("dataframe")
class DataFrameSource(BaseSource):
    r"""
    Proxy for a pandas DataFrame

    Examples
    --------

    You can pass a pandas DataFrame to this class directly via the Python API.
    Or you can create DataFrames from other data sources via the Python API or
    the command line.

    **Example of creating a DataFrame from HTML via command line.**

    Create an HTML table.

    **index.html**

    .. code-block:: html
        :test:
        :filepath: index.html

        <table>
            <tr>
                <th>Years</th>
                <th>Salary</th>
            </tr>
            <tr>
                <td>0</td>
                <td>10</td>
            </tr>
            <tr>
                <td>1</td>
                <td>20</td>
            </tr>
            <tr>
                <td>2</td>
                <td>30</td>
            </tr>
        </table>

    Start the HTTP server to serve the HTML page with the table

    .. code-block:: console
        :test:
        :daemon: 8000

        $ python -m http.server 8000

    In another terminal. List all the records in the source.

    .. code-block:: console
        :test:
        :replace: cmds[0][-3] = cmds[0][-3].replace("8000", str(ctx["HTTP_SERVER"]["8000"]))

        $ dffml list records \
            -sources table=dataframe \
            -source-table-html http://127.0.0.1:8000/index.html \
            -source-table-protocol_allowlist http://
        [
            {
                "extra": {},
                "features": {
                    "Salary": 10,
                    "Years": 0
                },
                "key": "0"
            },
            {
                "extra": {},
                "features": {
                    "Salary": 20,
                    "Years": 1
                },
                "key": "1"
            },
            {
                "extra": {},
                "features": {
                    "Salary": 30,
                    "Years": 2
                },
                "key": "2"
            }
        ]
    """

    CONFIG = DataFrameSourceConfig
    CONTEXT = DataFrameSourceContext

    def __init__(self, config):
        super().__init__(config)
        # A DataFrame handed to us via the config is used as is
        if self.config.dataframe is not None:
            return
        # pandas is only required when we have to create the DataFrame
        # ourselves, which is why it is imported here rather than at the top
        # of the file
        # NOTE(review): PandasNotInstalled and
        # DataFrameHTMLTableIndexNotFoundError are neither defined nor imported
        # in the part of the module visible here. Confirm they exist.
        try:
            import pandas
        except ImportError as error:
            # ModuleNotFoundError is a subclass of ImportError
            raise PandasNotInstalled(
                "Pandas is required to create new DataFrames. "
                "\n$ pip install pandas"
            ) from error
        # Nothing to load from, start out with an empty DataFrame
        if self.config.html is None:
            self.config.dataframe = pandas.DataFrame()
            return
        # TODO Modify this in line with changes for #1168
        # NOTE(review): protocol_allowlist is not consulted before the URL is
        # handed to pandas. Confirm whether it is enforced elsewhere.
        tables = pandas.read_html(self.config.html)
        table_index = self.config.html_table_index
        if table_index >= len(tables):
            raise DataFrameHTMLTableIndexNotFoundError(
                f"Index {table_index} requested"
                f" {len(tables)} table(s) found."
            )
        self.config.dataframe = tables[table_index]