Source code for heron.data

"""

The data module is designed to load and prepare arbitrary data sets
for use in machine learning algorithms.

"""

import numpy as np
import copy

class Data():
    """
    The data class is designed to hold non-timeseries data, and is
    capable of automatically selecting test data from the provided
    dataset.

    Future development will include the ability to add pre-selected
    test and verification data to the object.
    """

    def __init__(self, targets, labels,
                 target_sigma=None, label_sigma=None,
                 target_names=None, label_names=None,
                 test_targets=None, test_labels=None,
                 test_size=0.05):
        """
        Construct the training data object with pre-loaded data.

        Parameters
        ----------
        targets : array-like
            An array of training targets or "x" values which are to be
            used to train a machine learning algorithm.
        labels : array-like
            An array of training labels or "y" values which represent
            the observations made at the target locations of the data set.
        target_sigma : array-like
            Either an array of the uncertainty for each target point, or
            an array of the uncertainties, as a float, for each column in
            the targets.
        label_sigma : array-like
            Either an array of the uncertainty for each label point, or
            an array of the uncertainties, as a float, for each column in
            the labels.
        target_names : list, optional
            Names for the target (column) dimensions.
        label_names : list, optional
            Names for the label (column) dimensions.
        test_targets : array-like
            A set of test target data, which can be used to test the
            effectiveness of a prediction. If this isn't specified then
            the test data will be generated from the training data using
            the test_size parameter.
        test_labels : array-like
            The set of labels to accompany the test_targets.
        test_size : float
            The size of the test set as a fraction of the whole data set.
            The test set is selected at random from the data, and is not
            provided to the algorithm as training data.

        Notes
        -----
        Data used in machine learning algorithms is usually prepared in
        sets of "targets", which are the locations at which observations
        are made in some parameter space, and "labels", which are the
        observations made at those target points. The variable names
        used in this package will attempt to use this convention where
        practical, although it is not uncommon to see targets and labels
        described as "x" and "y" respectively in the literature,
        consistent with more traditional methods of data analysis.
""" self.normaliser = {} #targets = np.atleast_2d(targets) #labels = np.atleast_2d(labels) self.targets = self.normalise(targets, "target") self.labels = self.normalise(labels, "label") # Prepare the sigmas # if target_sigma: # # A full array of sigmas for each point # if hasattr(target_sigma, '__len__') and (not isinstance(target_sigma, str)): # if len(target_sigma) == len(targets): # self.target_sigma = self.normalise(target_sigma, "target") # else: # raise ValueError("The length of the uncertainty array doesn't match the data") # # An array with a fixed sigma for each column # else: # self.target_sigma = np.ones(len(target_sigma))*self.normalise(target_sigma, "target") # # If no sigma is provided, assume it equals zero # else: # self.target_sigma = np.zeros_like(targets) # Do the same for the labels if label_sigma: # A full array of sigmas for each point if hasattr(label_sigma, '__len__') and (not isinstance(label_sigma, str)): if len(label_sigma) == labels.shape[0]: self.label_sigma = self.normalise(label_sigma, "label") else: raise ValueError("The length of the label uncertainty array doesn't match the data") # An array with a fixed sigma for each column else: self.label_sigma = np.ones(len(labels))*self.normalise(label_sigma, "label") # If no sigma is provided, assume it equals zero else: self.label_sigma = np.zeros_like(labels) if not isinstance(test_targets, type(None)) and not isinstance(test_labels, type(None)): # Targets and labels have been provided, so we'll use those rather than # using the input data. self.test_targets = np.squeeze(self.normalise(test_targets, "target")) self.test_labels = np.squeeze(self.normalise(test_labels, "label")) self.labels = np.squeeze(self.labels) else: # Otherwise we use a portion of the training data. # Prepare the test entries test_entries = int(np.floor(test_size * len(self.labels))) test_entries = np.random.random_integers(0, len(self.labels)-1, test_entries) # self.test_targets = self.targets[test_entries] self.test_labels = self.labels[test_entries] # self.targets = np.delete(self.targets, test_entries, axis=0) self.labels = np.delete(self.labels, test_entries, axis=0) #self.target_sigma = np.delete(self.target_sigma, test_entries, axis=0) self.label_sigma = np.delete(self.label_sigma, test_entries, axis=0) if target_names: self.target_names = target_names else: self.target_names = range(self.targets.shape[-1]) if label_names: self.label_names = label_names else: self.label_names = range(self.labels.shape[-1])
    def copy(self):
        """
        Return a copy of this data object.
        """
        return copy.copy(self)
    def name2ix(self, name):
        """
        Convert the name of a column to a column index.
        """
        n2i = {n: i for i, n in enumerate(self.target_names)}
        return n2i[name]
    def ix2name(self, ix):
        """
        Convert the index of a column to a column name.
        """
        i2n = {i: n for i, n in enumerate(self.target_names)}
        return i2n[ix]
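    # Example (illustrative, continuing the sketch above where the Data object
    # was built with target_names=["a", "b"]): mapping between column names
    # and column indices.
    #
    #     >>> data.name2ix("b")    # -> 1
    #     >>> data.ix2name(1)      # -> "b"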
    def calculate_normalisation(self, data, name):
        """
        Calculate the offsets for the normalisation. We'll normally want to
        normalise the training data, and then be able to normalise and
        denormalise new inputs according to that.

        Parameters
        ----------
        data : array-like
            The array of data to use to calculate the normalisations.
        name : str
            The name to label the constants with.
        """
        dc = np.array(data.min(axis=0))
        data_range = np.array(np.abs(data.max(axis=0) - data.min(axis=0)))
        # Columns with zero range keep their offset but divide by one, which
        # avoids a division by zero during normalisation.
        dc[data_range == 0.0] = np.array(data.min(axis=0))[data_range == 0]
        data_range[data_range == 0.0] = 1.0

        self.normaliser[name] = (dc, data_range)

        return (dc, data_range)
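    # Example (illustrative): the normalisation constants are a (dc, range)
    # pair per column, cached in self.normaliser under the given name. The
    # array and the "demo" name are made up for this sketch.
    #
    #     >>> d = np.array([[0., 5.], [2., 5.], [4., 5.]])
    #     >>> dc, rng = data.calculate_normalisation(d, "demo")
    #     >>> dc     # column minima -> array([0., 5.])
    #     >>> rng    # column ranges, with zero ranges replaced by 1 -> array([4., 1.])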
    def get_starting(self):
        """
        Attempts to guess sensible starting values for the hyperparameter
        values.

        Returns
        -------
        hyperparameters : ndarray
            An array of values for the various hyperparameters.
        """
        values = []
        for ax in range(self.targets.shape[1]):
            values.append(np.median(np.unique(np.diff(self.targets[:, ax])))/2)
        return np.array(values)
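    # Example (illustrative): the starting guess for each target dimension is
    # half the median of the unique consecutive differences in that
    # (normalised) column, i.e. a rough proxy for the data spacing. The grid
    # below is an assumption made for this sketch.
    #
    #     >>> grid = np.linspace(0, 1, 11).reshape(-1, 1)   # spacing of 0.1
    #     >>> d = Data(grid, np.sin(grid), test_size=0.0)
    #     >>> d.get_starting()    # roughly array([0.05])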
    def normalise(self, data, name):
        """
        Normalise a given array of data so that the values of the data have a
        minimum at 0 and a maximum at 1. This improves the computability of
        the majority of data sets.

        Parameters
        ----------
        data : array-like
            The array of data to be normalised.
        name : str
            The name of the normalisation to be applied, e.g. "target" or
            "label".

        Returns
        -------
        norm_data : array-like
            An array of normalised data.

        Notes
        -----
        In order to perform the normalisation we need two steps:
        1) Subtract the "DC offset", which is the minimum of the data
        2) Divide by the range of the data
        """
        data = np.array(data)
        if name in self.normaliser:
            dc, data_range = self.normaliser[name]
        else:
            dc, data_range = self.calculate_normalisation(data, name)

        if np.all(data_range == 0.0):
            # A completely flat data set only needs the offset removed
            return data - dc
        else:
            normalised = (data - dc)
            normalised /= data_range
            return normalised
    def denormalise(self, data, name):
        """
        Reverse the normalise() method's effect on the data, and return it to
        the correct scaling.

        Parameters
        ----------
        data : array-like
            The normalised data.
        name : str
            The name of the normalisation which was applied, e.g. "target" or
            "label".

        Returns
        -------
        array-like
            The denormalised data.
        """
        if name not in self.normaliser:
            raise ValueError("There is no normalisation for {}".format(name))
        dc, data_range = self.normaliser[name]
        return data*data_range + dc
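    # Example (illustrative): normalise() and denormalise() are inverses once
    # the constants for a name have been calculated. The array and the
    # "demo2" name are assumptions for this sketch.
    #
    #     >>> raw = np.array([[1., 10.], [3., 30.]])
    #     >>> norm = data.normalise(raw, "demo2")      # caches (dc, range) under "demo2"
    #     >>> back = data.denormalise(norm, "demo2")
    #     >>> np.allclose(raw, back)                   # -> True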
    def add_data(self, targets, labels, target_sigma=None, label_sigma=None):
        """
        Add new rows into the data object.

        Parameters
        ----------
        targets : array-like
            An array of training targets or "x" values which are to be used
            to train a machine learning algorithm.
        labels : array-like
            An array of training labels or "y" values which represent the
            observations made at the target locations of the data set.
        target_sigma : array-like
            Either an array of the uncertainty for each target point, or an
            array of the uncertainties, as a float, for each column in the
            targets.
        label_sigma : array-like
            Either an array of the uncertainty for each label point, or an
            array of the uncertainties, as a float, for each column in the
            labels.
        """
        targets = np.atleast_2d(targets)
        labels = np.atleast_2d(labels)

        if self.targets.shape[0] == 1:
            self.targets = np.vstack([self.targets.T, self.normalise(targets, "target")]).T
        else:
            self.targets = np.vstack([self.targets, self.normalise(targets, "target")])

        if self.labels.shape[0] == 1:
            self.labels = np.vstack([self.labels.T, self.normalise(labels, "label")]).T
        else:
            self.labels = np.vstack([self.labels, self.normalise(labels, "label")])

        # Prepare the sigmas
        # if target_sigma:
        #     # A full array of sigmas for each point
        #     if hasattr(target_sigma, '__len__') and (not isinstance(target_sigma, str)):
        #         if len(target_sigma) == len(targets):
        #             if self.target_sigma.shape[0] == 1:
        #                 self.target_sigma = np.vstack([self.target_sigma.T, self.normalise(target_sigma, "target")]).T
        #             else:
        #                 self.target_sigma = np.vstack([self.target_sigma, self.normalise(target_sigma, "target")])
        #         else:
        #             raise ValueError("The length of the uncertainty array doesn't match the data")
        #     # An array with a fixed sigma for each column
        #     else:
        #         if self.target_sigma.shape[0] == 1:
        #             self.target_sigma = np.vstack([self.target_sigma.T, np.ones(len(target_sigma))*self.normalise(target_sigma, "target")]).T
        #         else:
        #             self.target_sigma = np.vstack([self.target_sigma, np.ones(len(target_sigma))*self.normalise(target_sigma, "target")])
        # # If no sigma is provided, assume it equals zero
        # else:
        #     if self.target_sigma.shape[0] == 1:
        #         self.target_sigma = np.vstack([self.target_sigma.T, np.zeros_like(targets)]).T
        #     else:
        #         self.target_sigma = np.vstack([self.target_sigma, np.zeros_like(targets)])

        # Do the same for the labels
        if label_sigma is not None:
            # A full array of sigmas for each point
            if hasattr(label_sigma, '__len__') and (not isinstance(label_sigma, str)):
                if len(label_sigma) == len(labels):
                    if self.label_sigma.shape[0] == 1:
                        self.label_sigma = np.vstack([self.label_sigma.T, self.normalise(label_sigma, "label")]).T
                    else:
                        self.label_sigma = np.vstack([self.label_sigma, self.normalise(label_sigma, "label")])
                else:
                    raise ValueError("The length of the uncertainty array doesn't match the data")
            # An array with a fixed sigma for each column
            else:
                if self.label_sigma.shape[0] == 1:
                    self.label_sigma = np.vstack([self.label_sigma.T, np.ones(len(label_sigma))*self.normalise(label_sigma, "label")]).T
                else:
                    self.label_sigma = np.vstack([self.label_sigma, np.ones(len(label_sigma))*self.normalise(label_sigma, "label")])
        # If no sigma is provided, assume it equals zero
        else:
            if self.label_sigma.shape[0] == 1:
                self.label_sigma = np.vstack([self.label_sigma.T, np.zeros_like(labels)]).T
            else:
                self.label_sigma = np.vstack([self.label_sigma, np.zeros_like(labels)])
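    # Example (illustrative, continuing the earlier sketch): appending extra
    # rows. The new rows are normalised with the constants already stored
    # under the "target" and "label" names, so they remain on the same scale
    # as the existing data. The arrays below are assumptions.
    #
    #     >>> new_x = np.random.uniform(0, 10, (5, 2))
    #     >>> new_y = np.sin(new_x[:, 0:1]) + np.cos(new_x[:, 1:2])
    #     >>> data.add_data(new_x, new_y)
    #     >>> data.targets.shape    # five more rows than before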
class Timeseries():
    """
    This is a class designed to hold timeseries data for machine learning
    algorithms.

    Timeseries data needs to be handled differently from other datasets as it
    is rarely likely to be advantageous to select individual points from a
    timeseries as either test data or verification data. Instead the
    timeseries class will select individual timeseries as the test and
    verification data.
    """

    def __init__(self, targets, labels, target_names=None, label_names=None, test_size=0.05):
        """
        Construct the training data object with pre-loaded data.

        Parameters
        ----------
        targets : array-like
            An array of arrays of time-stamps for each timeseries.
        labels : array-like
            An array of the data observed at each time-stamp.
        target_names : list, optional
            Names for the target (column) dimensions.
        label_names : list, optional
            Names for the label (column) dimensions.
        test_size : float
            The size of the test set as a fraction of the whole data set.
            The test set is selected at random from the data, and is not
            provided to the algorithm as training data.

        Notes
        -----
        Data used in machine learning algorithms is usually prepared in sets
        of "targets", which are the locations at which observations are made
        in some parameter space, and "labels", which are the observations
        made at those target points. The variable names used in this package
        will attempt to use this convention where practical, although it is
        not uncommon to see targets and labels described as "x" and "y"
        respectively in the literature, consistent with more traditional
        methods of data analysis.

        In the case of timeseries data we expect the data to arrive with a
        fairly specific format:

        1) Time-stamps: each point in the timeseries must have an associated
           time.
        2) Time-varying data: a series of data which are recorded at the
           specific time-stamped times.
        3) Metadata: information describing the observational or experimental
           configuration which produced the data.

        For example, if we wished to learn about how temperatures varied over
        the year around some city, we could take measurements with a
        thermometer at various points in the city at various times. The
        temperatures would constitute the data, the timestamps would be the
        times each reading was made, and the metadata might include details
        like the coordinates where the measurement was made.
        """
        targets = np.atleast_2d(targets)

        self.targets, self.targets_scale = self.normalise(targets)
        self.labels, self.labels_scale = self.normalise(labels)

        test_entries = int(np.floor(test_size * len(self.labels)))
        # randint's upper bound is exclusive, so valid indices stay in range.
        test_entries = np.random.randint(0, len(self.labels), test_entries)
        #
        self.test_targets = self.targets[test_entries]
        self.test_labels = self.labels[test_entries]
        #
        self.targets = np.delete(self.targets, test_entries, axis=0)
        self.labels = np.delete(self.labels, test_entries, axis=0)

        if target_names:
            self.target_names = target_names
        else:
            self.target_names = range(self.targets.shape[-1])

        if label_names:
            self.label_names = label_names
        else:
            self.label_names = range(self.labels.shape[-1])