Source code for nwm_region_mgr.parreg.synthetic_data
"""Function to create synthetic data for training Unsupervised Random Forest (URF) models."""
import numpy
[docs]
def create_synthetic_data(x, synthetic_data_type):
    """Build a labelled data set of real objects followed by synthetic ones.

    The real objects are stacked on top of an equally generated block of
    synthetic objects; the label vector marks real rows with 0 and synthetic
    rows with 1.

    :param x: the real data. For the ``"default"`` type this is a 2-D array
        of shape (objects, features). For the ``"f"`` type this is a sequence
        of 2-D chunk arrays that share the same number of rows; the chunks
        are stacked horizontally to form the real feature matrix.
    :param synthetic_data_type: ``"default"``, ``"f"`` or ``None`` (``None``
        is treated as ``"default"``).
    :return: tuple ``(x_total, y_total)``. For an unrecognised
        ``synthetic_data_type`` a message is printed and ``-1`` is returned
        instead (kept for backward compatibility).
    """
    if synthetic_data_type is None:
        synthetic_data_type = "default"

    if synthetic_data_type == "default":
        real_x = numpy.asarray(x)
        synthetic_x = default_synthetic_data(real_x)
    elif synthetic_data_type == "f":
        # Here x is a collection of chunks, so the number of objects must be
        # read from the stacked matrix, not from x itself.
        synthetic_x = f_synthetic_data(x)
        real_x = numpy.hstack(x)
    else:
        print("Bad synthetic data type")
        return -1

    x_total = numpy.concatenate([real_x, synthetic_x])
    # Label counts follow the matrices actually concatenated above.
    y_total = numpy.concatenate(
        [numpy.zeros(real_x.shape[0]), numpy.ones(synthetic_x.shape[0])]
    )
    return x_total, y_total
[docs]
def f_synthetic_data(x_list):
    """Create synthetic objects by recombining whole chunk rows.

    Every synthetic object is assembled chunk by chunk: for each chunk one
    object index is drawn uniformly with replacement, and that object's row
    of the chunk is copied. Features inside a chunk therefore stay together,
    while the pairing between different chunks is randomised.

    :param x_list: sequence of 2-D arrays (one per chunk) that all have the
        same number of rows (objects).
    :return: float array with the same shape as ``numpy.hstack(x_list)``.
    """
    chunks = [numpy.asarray(chunk) for chunk in x_list]
    real_x = numpy.hstack(chunks)
    nof_chunks = len(chunks)
    nof_objects = real_x.shape[0]
    # One source-object index per (synthetic object, chunk) pair.
    chunks_inds = numpy.random.choice(
        numpy.arange(nof_objects), [nof_objects, nof_chunks]
    )
    synthetic_x = numpy.zeros(real_x.shape)
    # Gather all rows of a chunk in one indexing operation instead of
    # building each synthetic object in a Python-level loop.
    synthetic_x[...] = numpy.hstack(
        [chunk[chunks_inds[:, c]] for c, chunk in enumerate(chunks)]
    )
    return synthetic_x
[docs]
def default_synthetic_data(x):
    """Create synthetic objects by resampling every feature independently.

    Each column of the result is drawn with replacement from the matching
    column of ``x``, so per-feature value distributions are kept while the
    relationships between features are broken.

    :param x: 2-D array of shape (objects, features).
    :return: float array with the same shape as ``x``.
    """
    resampled = numpy.zeros(x.shape)
    n_samples, n_features = x.shape[0], x.shape[1]
    for col in range(n_features):
        draws = numpy.random.choice(x[:, col], n_samples)
        resampled[:, col] += draws
    return resampled