# Source code for pynol.utils.data_generator

import math
from abc import ABC, abstractmethod
from typing import Optional

import numpy as np


class DataGenerator(ABC):
    """Synthetic data generator.

    Abstract base class: concrete generators subclass it and implement
    :meth:`generate_data`.
    """

    def __init__(self):
        pass

    @abstractmethod
    def generate_data(self, *args, **kwargs):
        """Generate data by parameters.

        The base signature accepts arbitrary positional and keyword arguments
        so that a subclass can declare whatever parameters its generator
        needs without contradicting the signature declared here.

        Raises:
            NotImplementedError: Always, when this base implementation is
                reached (e.g. through ``super()``); subclasses must provide
                the actual implementation.
        """
        raise NotImplementedError()
class LinearRegressionGenerator(DataGenerator):
    """Generate data for linear regression."""

    def __init__(self):
        super().__init__()

    def generate_data(self,
                      T: int,
                      dimension: int,
                      stage: int = 1,
                      radius: float = 1.,
                      Gamma: float = 1.,
                      mu: float = 0.,
                      sigma: float = 0.05,
                      seed: Optional[int] = None):
        r"""Generate linear data with an abruptly changing environment.

        The synthetic data are generated as follows: at each round, the
        feature :math:`\varphi_t \in \mathbb{R}^d` is randomly generated from
        a ball with a radius of :math:`\Gamma`, i.e.,
        :math:`\{\varphi \in \mathbb{R}^d \mid \lVert \varphi \rVert_2 \leq \Gamma\}`;
        the associated label is set as
        :math:`y_t = \varphi_t^\top x_t^* + \epsilon_t`, where
        :math:`\epsilon_t` is a random Gaussian noise and :math:`x_t^*` is
        the underlying model. The underlying model :math:`x_t^*` is randomly
        sampled from a ball with a radius of :math:`R`: its direction is a
        normalized Gaussian vector and its norm is ``radius`` times a uniform
        sample from ``[0, 1)``.

        To simulate non-stationary environments with abrupt changes, the
        total rounds are divided into :math:`S` stages in which the
        underlying model is forced to be stationary. Each stage spans
        ``ceil(T / stage)`` consecutive rounds; the last stage is truncated
        at ``T`` and trailing stages are empty when ``T`` is exhausted early.

        Note:
            This method calls ``np.random.seed(seed)`` and therefore reseeds
            NumPy's global legacy random state as a side effect.

        Args:
            T (int): Number of total rounds.
            dimension (int): Dimension of the feature.
            stage (int): Numbers of stages.
            radius (float): radius of the underlying model.
            Gamma (float): radius of the feature.
            mu (float): Mean ("center") of the noise distribution.
            sigma (float): Standard deviation (spread or "width") of the
                noise distribution. Must be non-negative.
            seed (Optional): Random seed to generate data.

        Returns:
            tuple: ``(feature, label)`` where ``feature`` is an array of
            shape ``(T, dimension)`` and ``label`` is an array of shape
            ``(T,)`` holding the noisy labels.

        Raises:
            ValueError: If ``T`` is negative, or if ``dimension`` or
                ``stage`` is smaller than 1.
        """
        # Reject degenerate sizes up front: ``dimension == 0`` and
        # ``stage == 0`` would otherwise surface as a ZeroDivisionError, and
        # a negative ``stage`` would silently yield noise-only labels.
        if T < 0:
            raise ValueError(f'T must be non-negative, got {T}.')
        if dimension < 1:
            raise ValueError(f'dimension must be at least 1, got {dimension}.')
        if stage < 1:
            raise ValueError(f'stage must be at least 1, got {stage}.')

        # NOTE: kept as a global reseed so that results for a given seed
        # remain identical to previous versions of this generator.
        np.random.seed(seed)

        # Features: normalized Gaussian columns give uniform directions, and
        # radii of the form u**(1/d) spread the points uniformly over the
        # volume of the ball rather than concentrating them near the center.
        random_directions = np.random.normal(size=(dimension, T))
        random_directions /= np.linalg.norm(random_directions, axis=0)
        random_radii = np.random.random(T)**(1 / dimension)
        feature = Gamma * (random_directions * random_radii).T

        label = np.zeros(T)
        step = math.ceil(T / stage)
        for i in range(stage):
            # A fresh underlying model is drawn for every stage, which is
            # what produces the abrupt change at each stage boundary.
            random_vec = np.random.normal(size=dimension)
            x = random_vec / np.linalg.norm(
                random_vec) * radius * np.random.rand()
            left, right = i * step, min((i + 1) * step, T)
            label[left:right] = np.dot(feature[left:right, :], x)

        noise = np.random.normal(mu, sigma, T)
        return feature, label + noise