Skip to content

convert.from_numpy

Convert vector-based ML datasets to tuple-based ILP datasets.

from_numpy(X, y, names=None)

Convert numpy data (X) and target (y) arrays to a RelationalDataset with modes.

Parameters:

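| Name | Type | Description | Default |
| --- | --- | --- | --- |
| X | np.ndarray | Integer matrix of covariates | required |
| y | np.ndarray | Integer or float array containing the target variable | required |
| names | Optional[List[str]] | List of strings representing the variable names: one per column of X, followed by the name of the target variable | None |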

Returns:

| Type | Description |
| --- | --- |
| Tuple[RelationalDataset, List[str]] | Tuple of RelationalDataset and a list of strings containing the modes |

Raises:

| Type | Description |
| --- | --- |
| TypeError | When classification vs. regression cannot be determined from the types of the input values. |

Examples:

Demonstrates converting a set of binary classification data.

from relational_datasets.convert import from_numpy
import numpy as np

data, modes = from_numpy(
  np.array([[0, 1, 1], [0, 1, 2], [1, 2, 2]]),
  np.array([0, 0, 1]),
)
Source code in relational_datasets/convert/convert_numpy.py
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def from_numpy(X: np.ndarray, y: np.ndarray, names: Optional[List[str]] = None) -> Tuple[RelationalDataset, List[str]]:
    """Convert numpy data (`X`) and target (`y`) arrays to a RelationalDataset
    with modes.

    Every row `i` of `X` (counting from 1) becomes an example `id{i}`. Each
    column of `X` contributes one fact per row, and `y` contributes the
    positive/negative (or regression) examples.

    Arguments:
        X: Integer matrix of covariates (2-dimensional, one row per example)
        y: Integer or float array containing the target variable
        names: List of strings representing the variable names: one name per
            column of `X`, followed by the name of the target variable.
            When omitted (or empty), names `v1`, `v2`, ... are generated.

    Returns:
        Tuple of `RelationalDataset` and a list of strings containing the modes

    Raises:
        TypeError: When classification vs. regression cannot be determined from
            the types of the input values.
        ValueError: When `X` is not 2-dimensional, when `X` and `y` have a
            different number of rows, or when `names` does not contain
            exactly one name per column of `X` plus one for the target.

    Examples:

    Demonstrates converting a set of binary classification data.

    ```python
    from relational_datasets.convert import from_numpy
    import numpy as np

    data, modes = from_numpy(
      np.array([[0, 1, 1], [0, 1, 2], [1, 2, 2]]),
      np.array([0, 0, 1]),
    )
    ```

    """

    # Validate with explicit exceptions instead of `assert`: assertions are
    # removed when Python runs with `-O`, which would let malformed input
    # silently produce an inconsistent dataset.
    if X.ndim != 2:
        raise ValueError(
            f"X must be a 2-dimensional array, got {X.ndim} dimension(s)."
        )
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            f"X and y must have the same number of rows, "
            f"got {X.shape[0]} and {y.shape[0]}."
        )

    # NOTE(hayesall): All `enumerate` calls over examples start from `1` to
    #   maintain parity with the Julia module.

    # NOTE(hayesall): This is a way to "fail fast": if we cannot determine
    #   the type of the `y` vector, the conversion is not possible.
    _task = _get_task(y)

    n_features = X.shape[1]

    if names:
        if len(names) != n_features + 1:
            raise ValueError(
                f"names must contain {n_features + 1} entries "
                f"(one per column of X plus the target), got {len(names)}."
            )
    else:
        # Generate n_features + 1 names, `v1` .. `v{n_features + 1}`;
        # the last one names the target variable.
        names = [f"v{i}" for i in range(1, n_features + 2)]

    target = names[-1]
    pos, neg, facts = [], [], []

    if _task == "classification":
        # Truthy labels become positive examples, falsy labels negative ones.
        for i, label in enumerate(y, 1):
            if label:
                pos.append(f"{target}(id{i}).")
            else:
                neg.append(f"{target}(id{i}).")

    elif _task == "multiclass-classification":
        # Every example is positive; the class is carried as a second argument.
        for i, label in enumerate(y, 1):
            pos.append(f"{target}(id{i},{target}_{label}).")

    else:
        # Any remaining task is handled as regression.
        for i, value in enumerate(y, 1):
            pos.append(f"regressionExample({target}(id{i}),{value}).")

    # One fact per (column, row) cell. `zip` stops after the last column of X,
    # so the trailing target name is never used as a feature.
    for var, col in zip(names, X.T):
        facts.extend(
            f"{var}(id{j},{var}_{value})." for j, value in enumerate(col, 1)
        )

    modes = [f"{name}(+id,#var{name})." for name in names[:-1]]
    if _task == "multiclass-classification":
        modes.append(f"{target}(+id,#classlabel).")
    else:
        modes.append(f"{target}(+id).")

    return RelationalDataset(pos=pos, neg=neg, facts=facts), modes

Last update: June 20, 2022