Source code for chocolate.connection.pandas

from contextlib import contextmanager
import pickle

import numpy
import pandas

from ..base import Connection


class DataFrameConnection(Connection):
    """Connection to a pandas DataFrame.

    This connection is meant for situations where it is not possible to use
    the file system or another type of traditional database (e.g. in
    `Kaggle <http://kaggle.com>`_ scripts). It must absolutely not be used in
    concurrent processes. In fact, using this connection in different
    processes will result in two independent searches **not** sharing any
    information.

    Args:
        from_file: The name of a file containing a pickled data frame
            connection. The file is unpickled, so it must come from a trusted
            source.

    Raises:
        TypeError: If the object stored in *from_file* is not a
            :class:`DataFrameConnection`.

    Using this connection requires small adjustments to the proposed main
    script. When the main process finishes, all data will vanish if not
    explicitly written to disk. Thus, instead of doing a single evaluation,
    the main process will incorporate a loop calling the search/sample
    ``next`` method multiple times. Additionally, at the end of the experiment,
    either extract the best configuration using :meth:`results_as_dataframe`
    or write all the data using :mod:`pickle`.
    """

    def __init__(self, from_file=None):
        if from_file is not None:
            # SECURITY: unpickling can execute arbitrary code; only load files
            # that were produced by a trusted process.
            with open(from_file, "rb") as f:
                # pickle.load() needs the file object itself, not its bytes.
                conn = pickle.load(f)

            if not isinstance(conn, DataFrameConnection):
                raise TypeError("Unpickled connection is not of type DataFrameConnection")

            self.results = conn.results
            self.complementary = conn.complementary
            self.space = conn.space

        else:
            self.results = pandas.DataFrame()
            self.complementary = pandas.DataFrame()
            self.space = None

    @staticmethod
    def _rows(frame):
        """Return the rows of *frame* as a list of ``{column: value}`` dicts."""
        return list(frame.T.to_dict().values())

    @staticmethod
    def _select(frame, filter):
        """Return the rows of *frame* whose columns equal every *filter* value."""
        selection = frame
        for k, v in filter.items():
            selection = selection[selection[k] == v]
        return selection

    @staticmethod
    def _append_row(frame, document):
        """Return a new frame made of *frame* followed by the row *document*.

        ``DataFrame.append`` no longer exists in pandas >= 2.0, so the row is
        wrapped in its own frame and concatenated instead.
        """
        row = pandas.DataFrame([document])
        if frame.shape == (0, 0):
            # Nothing stored yet: the new row is the whole table. This also
            # keeps a completely empty frame from influencing column dtypes.
            return row
        return pandas.concat([frame, row], ignore_index=True)

    @contextmanager
    def lock(self, *args, **kwargs):
        """This function does not lock anything. Do not use in concurrent
        processes.
        """
        yield

    def all_results(self):
        """Get a list of all entries of the result table. The order is
        undefined.
        """
        return self._rows(self.results)

    def find_results(self, filter):
        """Get a list of all results associated with *filter*. The order is
        undefined.

        Args:
            filter: A mapping of column names to the values the returned rows
                must have in these columns.
        """
        return self._rows(self._select(self.results, filter))

    def insert_result(self, document):
        """Insert a new *document* in the result data frame. The columns do
        not need to be defined nor all present. Any new column will be added
        to the database and any missing column will get a missing value (NaN).
        """
        self.results = self._append_row(self.results, document)

    def update_result(self, document, value):
        """Update or add *value* of given rows in the result data frame.

        Args:
            document: An identifier of the rows to update.
            value: A mapping of values to update or add.
        """
        size = len(self.results.index)
        # Start from "every row matches" and narrow down one key at a time.
        selection = numpy.ones(size, dtype=bool)
        for k, v in document.items():
            selection = numpy.logical_and(self.results[k] == v, selection)

        for k, v in value.items():
            if k not in self.results:
                # Create the new column filled with None before writing the
                # value into the selected rows only.
                self.results[k] = pandas.Series([None] * size, index=self.results.index, dtype=object)
            self.results.loc[selection, k] = v

    def count_results(self):
        """Get the total number of entries in the result table."""
        return len(self.results.index)

    def all_complementary(self):
        """Get all entries of the complementary information table as a list.
        The order is undefined.
        """
        return self._rows(self.complementary)

    def insert_complementary(self, document):
        """Insert a new document (row) in the complementary information data frame."""
        self.complementary = self._append_row(self.complementary, document)

    def find_complementary(self, filter):
        """Find a document (row) from the complementary information data frame.

        Args:
            filter: A mapping of column names to the values the returned row
                must have in these columns.

        Returns:
            The first matching row as a dictionary.

        Raises:
            IndexError: If no row matches *filter*.
        """
        return self._rows(self._select(self.complementary, filter))[0]

    def get_space(self):
        """Returns the space used for previous experiments."""
        return self.space

    def insert_space(self, space):
        """Insert a space in the database.

        Raises:
            AssertionError: If a space is already present.
        """
        # Raised explicitly (rather than with an ``assert`` statement) so the
        # check is still enforced when Python runs with optimizations (-O).
        if self.space is not None:
            raise AssertionError("Space table cannot contain more than one space, clear table first.")
        self.space = space

    def clear(self):
        """Clear all data."""
        self.results = pandas.DataFrame()
        self.complementary = pandas.DataFrame()
        self.space = None

    def pop_id(self, document):
        """Pops the database unique id from the document.

        This implementation removes nothing: *document* is returned unchanged.
        """
        return document