import
# %pip install plotly (jupyter notebook)
from plotly.offline import iplot
import plotly.graph_objs as go
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
#pio.renderers.default = 'iframe_connected'
#pio.renderers.default = "vscode"
pio.renderers.default = "plotly_mimetype+notebook"
Data
 | trait | SNP_01 | SNP_02 | SNP_03 | SNP_04 | SNP_05 | SNP_06 | SNP_07 | SNP_08 | SNP_09 | SNP_10 | SNP_11 | SNP_12 | SNP_13 | SNP_14 | SNP_15 | class |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | G G | A G | A A | G A | C A | A A | A A | G G | A A | G G | A G | A A | A A | A A | A A | B |
1 | 2 | A G | A G | C A | A A | A A | A G | A A | G A | A A | A G | A A | G A | G G | A A | A A | C |
2 | 2 | G G | G G | A A | G A | C C | G G | A A | G A | G A | A G | A A | A A | A A | A A | A A | B |
3 | 1 | A A | G G | A A | G A | A A | G G | G G | A A | G G | A G | G G | G G | G G | A A | G G | A |
4 | 2 | G G | G G | C C | A A | C C | A A | A A | A A | A A | G G | A A | A A | A G | A A | G A | C |
Preprocessing
def create_col(dataset, col: str, value: str):
    """Add a 0/1 indicator column marking rows where ``dataset[col] == value``.

    The new column is named ``"has" + <last two characters of col> + <value
    with spaces removed>``; for example ``col="SNP_03"`` and ``value="A A"``
    produce ``"has03AA"``.

    Args:
        dataset: DataFrame containing ``col``. It is modified in place.
        col: Name of the source column to test.
        value: Value to compare each row of ``col`` against.

    Returns:
        The same ``dataset`` object, with the indicator column added (or
        overwritten if a column of that name already exists).
    """
    col_name = "has" + col[-2:] + value.replace(" ", "")
    # Vectorised comparison; cast to int64 so the column holds 1/0, not booleans.
    dataset[col_name] = (dataset[col] == value).astype("int64")
    return dataset
# Add one 0/1 indicator column per (SNP column, genotype) pair listed below.
_indicator_specs = [
    ("SNP_03", "A A"),
    ("SNP_04", "G G"),
    ("SNP_05", "C C"),
    ("SNP_06", "A A"),
    ("SNP_07", "A A"),
    ("SNP_07", "G G"),
    ("SNP_08", "G G"),
    ("SNP_09", "A A"),
    ("SNP_09", "G G"),
    ("SNP_11", "A A"),
    ("SNP_12", "A A"),
    ("SNP_12", "G G"),
    ("SNP_13", "A A"),
    ("SNP_14", "A A"),
]
for _snp, _genotype in _indicator_specs:
    dataset = create_col(dataset, _snp, _genotype)
# One-hot encode every column except "class" for distance-based algorithms.
# drop_first=True drops one level per encoded column to prevent multicollinearity.
_feature_cols = dataset.columns.drop("class")
dataset_ohe = pd.get_dummies(dataset, columns=_feature_cols, drop_first=True)

# Rows before train_len form the training split; the remainder is the test
# split, which does not keep the "class" column.
train_ohe = dataset_ohe[:train_len].copy()
test_ohe = dataset_ohe[train_len:].copy()
test_ohe = test_ohe.drop(columns="class")

# Encode the class labels as integers.
class_map = {"A": 0, "B": 1, "C": 2}
_encoded_class = train_ohe["class"].map(class_map)
train_ohe["class"] = _encoded_class.astype(int)

X_train_ohe = train_ohe.drop(columns="class")
# Named with the "_ohe" suffix only for consistency with X_train_ohe.
Y_train_ohe = train_ohe["class"]
Dimension reduction using an autoencoder with PyTorch
class Encoder(nn.Module):
    """Single linear layer followed by a ReLU activation."""

    def __init__(self, in_features, encoding_features):
        """Build the layer mapping ``in_features`` inputs to ``encoding_features`` outputs."""
        super().__init__()
        self.in_features = in_features
        self.encoding_features = encoding_features
        self.linr = nn.Linear(in_features, encoding_features)
        self.active_func = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``ReLU(linr(x))``."""
        projected = self.linr(x)
        return self.active_func(projected)
class Decoder(nn.Module):
    """Single linear layer with no activation on its output."""

    def __init__(self, encoding_features, out_features):
        """Build the layer mapping ``encoding_features`` inputs to ``out_features`` outputs."""
        super().__init__()
        self.encoding_features = encoding_features
        self.out_features = out_features
        self.linr = nn.Linear(encoding_features, out_features)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the linear projection of ``x``."""
        return self.linr(x)
class AutoEncoder(nn.Module):
    """Encoder followed by a Decoder whose output width equals the input width."""

    def __init__(self, in_features, encoding_features):
        """Create the encoder/decoder pair.

        Args:
            in_features: Width of the input; also used as the decoder's output width.
            encoding_features: Width of the encoding between encoder and decoder.
        """
        super().__init__()
        # The decoder maps the encoding back to the original input width.
        self.encoder = Encoder(in_features, encoding_features)
        self.decoder = Decoder(encoding_features, in_features)

    def forward(self, x):
        """Encode ``x`` and return the decoder's output for that encoding."""
        return self.decoder(self.encoder(x))
encoding dimension=3
training autoencoder
# Training loop: each epoch feeds all of X_train_ohe through the model and
# takes a single optimizer step.
# NOTE(review): X_train_ohe is presumably a tensor built in an earlier cell - confirm.
_log_every = 10000
for epoch in range(20000):
    # 1. Forward pass (y-hat).
    out = autoencoder_3(X_train_ohe)
    # 2. Loss of the output measured against the input itself.
    loss = loss_fn(out, X_train_ohe)
    # 3. Backpropagate.
    loss.backward()
    if not epoch % _log_every:
        print(f"epoch:{epoch} loss:{loss.tolist()}")
    # 4. Update the parameters, then reset the gradients for the next epoch.
    optimizer.step()
    optimizer.zero_grad()
epoch:0 loss:0.5274774432182312
epoch:10000 loss:0.11526338756084442
visualization
{0: 'A', 1: 'B', 2: 'C'}
# Build one 3-D scatter trace per distinct value of dt_dim3["class"], so each
# class gets its own colour and legend entry.
_palette = ("red", "blue")  # first and second class encountered; any later class is black
data = []
for count, cl in enumerate(dt_dim3["class"].unique()):
    _data = dt_dim3.loc[dt_dim3["class"] == cl, :]
    color = _palette[count] if count < len(_palette) else "black"
    data.append(
        go.Scatter3d(
            x=_data.x.tolist(),
            y=_data.y.tolist(),
            z=_data.z.tolist(),
            mode="markers",
            marker=dict(color=color, size=2),
            # Legend label is looked up from the class value via class_map_inv.
            name=str(class_map_inv[cl]),
        )
    )

layout = go.Layout(title=dict(text="3-dimension "))
# Assemble the figure from the traces and layout, then render it.
fig = go.Figure(data=data, layout=layout)
fig.show()
encoding dimension=10
# Training loop: each epoch feeds all of X_train_ohe through the model and
# takes a single optimizer step.
# NOTE(review): this section is headed "encoding dimension=10" but the loop
# still trains `autoencoder_3`; the cell that defines the model is not shown
# here - confirm the intended model/optimizer is the one being used.
_log_every = 10000
for epoch in range(40000):
    # 1. Forward pass (y-hat).
    out = autoencoder_3(X_train_ohe)
    # 2. Loss of the output measured against the input itself.
    loss = loss_fn(out, X_train_ohe)
    # 3. Backpropagate.
    loss.backward()
    if not epoch % _log_every:
        print(f"epoch:{epoch} loss:{loss.tolist()}")
    # 4. Update the parameters, then reset the gradients for the next epoch.
    optimizer.step()
    optimizer.zero_grad()
epoch:0 loss:0.3828417658805847
epoch:10000 loss:0.060432590544223785
epoch:20000 loss:0.06043253839015961
epoch:30000 loss:0.060432031750679016