import

import torch.nn as nn
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import random
import os
import numpy as np
import torch
# Locations of the input CSV files, given relative to the current working directory.
test_path = "./test.csv"
train_path = "./train.csv"

# %pip install plotly (jupyter notebook)
from plotly.offline import iplot
import plotly.graph_objs as go
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
# Choose the default Plotly renderer. The two disabled lines are alternative
# renderer names kept for switching front ends; only the last assignment is active.
#pio.renderers.default = 'iframe_connected'
#pio.renderers.default = "vscode"
pio.renderers.default = "plotly_mimetype+notebook"

Data

# Load the training rows without their "id" column and remember how many
# there are; train_len is used later to split the combined frame again.
train = pd.read_csv(train_path).drop(columns=["id"])
train_len = len(train)

# Read the test file a single time: keep its "id" column aside, then drop it
# from the frame (previously the CSV was parsed twice to achieve the same).
test = pd.read_csv(test_path)
id_test = test["id"]
test = test.drop(columns=["id"])

# Stack train above test so every later transformation is applied to both.
# The row labels of each source frame are kept as-is (no ignore_index).
dataset = pd.concat([train, test], axis=0)
dataset = dataset.drop(columns=["father", "mother", "gender"])

dataset.head(5)

	trait	SNP_01	SNP_02	SNP_03	SNP_04	SNP_05	SNP_06	SNP_07	SNP_08	SNP_09	SNP_10	SNP_11	SNP_12	SNP_13	SNP_14	SNP_15	class
0	2	G G	A G	A A	G A	C A	A A	A A	G G	A A	G G	A G	A A	A A	A A	A A	B
1	2	A G	A G	C A	A A	A A	A G	A A	G A	A A	A G	A A	G A	G G	A A	A A	C
2	2	G G	G G	A A	G A	C C	G G	A A	G A	G A	A G	A A	A A	A A	A A	A A	B
3	1	A A	G G	A A	G A	A A	G G	G G	A A	G G	A G	G G	G G	G G	A A	G G	A
4	2	G G	G G	C C	A A	C C	A A	A A	A A	A A	G G	A A	A A	A G	A A	G A	C

Preprocessing

# 0/1 indicator columns: 1 where the SNP column equals the given genotype
# string, 0 otherwise. The comparison is vectorized and converted to a NumPy
# int64 array so the assignment is positional, independent of row labels.
dataset["has01GG"] = (dataset["SNP_01"] == "G G").to_numpy().astype("int64")
dataset["has02AA"] = (dataset["SNP_02"] == "A A").to_numpy().astype("int64")

def create_col(dataset, col, value):
    """Add a 0/1 indicator column marking rows where ``dataset[col] == value``.

    The new column is named ``"has"`` + the last two characters of ``col`` +
    ``value`` with its spaces removed; for example
    ``create_col(df, "SNP_03", "A A")`` adds the column ``"has03AA"``.

    Args:
        dataset: pandas DataFrame that contains ``col``. Modified in place.
        col: Name of the source column.
        value: Value each entry of ``col`` is compared against.

    Returns:
        The same ``dataset`` object, with the indicator column added (or
        overwritten when a column of that name already exists).
    """
    # Vectorized comparison instead of a per-row Python loop. Converting to a
    # NumPy array makes the assignment positional, so it also works when the
    # frame's index contains repeated labels.
    indicator = (dataset[col] == value).to_numpy().astype("int64")

    # Strip spaces from the value to build the column-name suffix.
    suffix = "".join(ch for ch in value if ch != " ")
    dataset["has" + col[-2:] + suffix] = indicator

    return dataset

# (source column, genotype) pairs to turn into 0/1 indicator columns, listed
# in the order in which the new columns are appended to the frame.
_indicator_specs = [
    ("SNP_03", "A A"),
    ("SNP_04", "G G"),
    ("SNP_05", "C C"),
    ("SNP_06", "A A"),
    ("SNP_07", "A A"),
    ("SNP_07", "G G"),
    ("SNP_08", "G G"),
    ("SNP_09", "A A"),
    ("SNP_09", "G G"),
    ("SNP_11", "A A"),
    ("SNP_12", "A A"),
    ("SNP_12", "G G"),
    ("SNP_13", "A A"),
    ("SNP_14", "A A"),
]
for _snp, _genotype in _indicator_specs:
    dataset = create_col(dataset, _snp, _genotype)

# One-hot encode every column except the "class" target, for use with
# distance-based algorithms.
_feature_columns = dataset.columns.drop("class")
# drop_first=True drops one dummy level per column to avoid multicollinearity.
dataset_ohe = pd.get_dummies(dataset, columns=_feature_columns, drop_first=True)

# The first train_len rows are the training rows; the remainder is the test set,
# which does not keep the "class" column.
train_ohe = dataset_ohe[:train_len].copy()
test_ohe = dataset_ohe[train_len:].copy().drop(columns="class")

# Map class letters to integer labels.
class_map = {"A": 0, "B": 1, "C": 2}
train_ohe["class"] = train_ohe["class"].map(class_map).astype(int)

# The target is not one-hot encoded; the "_ohe" suffix is only there to match
# the naming of X_train_ohe.
Y_train_ohe = train_ohe["class"]
X_train_ohe = train_ohe.drop(columns="class")

Y_train_ohe[:5]

0    1
1    2
2    1
3    0
4    2
Name: class, dtype: int32

# Features become a float32 tensor.
_features_np = X_train_ohe.values
X_train_ohe = torch.from_numpy(_features_np).to(torch.float32)

# Labels stay integer class indices as an int64 tensor (a one-hot float
# encoding of the labels was tried here and left disabled).
_labels_np = Y_train_ohe.values
Y_train_ohe = torch.from_numpy(_labels_np).to(torch.int64)

Dimensionality reduction using an autoencoder with PyTorch

class Encoder(nn.Module):
    """Single linear layer followed by ReLU.

    Args:
        in_features: Size of each input vector.
        encoding_features: Size of each encoded output vector.
    """

    def __init__(self, in_features, encoding_features):
        super().__init__()
        self.in_features, self.encoding_features = in_features, encoding_features
        self.linr = nn.Linear(in_features, encoding_features)
        self.active_func = nn.ReLU()

    def forward(self, x):
        """Return ``active_func(linr(x))``."""
        projected = self.linr(x)
        return self.active_func(projected)
        
class Decoder(nn.Module):
    """Single linear layer with no activation.

    Args:
        encoding_features: Size of each encoded input vector.
        out_features: Size of each output vector.
    """

    def __init__(self, encoding_features, out_features):
        super().__init__()
        self.encoding_features, self.out_features = encoding_features, out_features
        self.linr = nn.Linear(encoding_features, out_features)

    def forward(self, x):
        """Return ``linr(x)``."""
        return self.linr(x)

class AutoEncoder(nn.Module):
    """Encoder followed by a Decoder whose output size equals the input size.

    Args:
        in_features: Size of each input (and output) vector.
        encoding_features: Size of the intermediate encoded vector.
    """

    def __init__(self, in_features, encoding_features):
        super().__init__()
        # The decoder maps back to the input width.
        self.encoder = Encoder(in_features, encoding_features)
        self.decoder = Decoder(encoding_features, in_features)

    def forward(self, x):
        """Return ``decoder(encoder(x))``."""
        encoded = self.encoder(x)
        return self.decoder(encoded)

encoding dimension=3

training autoencoder

# Fixed seed so that parameter initialisation is reproducible.
torch.manual_seed(201711375)
# Take the input width from the feature tensor instead of hard-coding it, so
# the model keeps matching the data if the set of encoded columns changes.
autoencoder_3 = AutoEncoder(X_train_ohe.shape[1], 3)
loss_fn = torch.nn.MSELoss()
relu = torch.nn.LeakyReLU()  # NOTE(review): not used by the loop below, and it is a LeakyReLU despite the name
optimizer = torch.optim.Adam(autoencoder_3.parameters(), lr=0.001)

# Each epoch is one optimisation step on the whole training tensor.
for epoch in range(20000):
    # 1. Reconstruct the inputs.
    out = autoencoder_3(X_train_ohe)
    # 2. Reconstruction error against the inputs themselves.
    loss = loss_fn(out, X_train_ohe)
    # 3. Back-propagate.
    loss.backward()
    if epoch % 10000 == 0:
        print(f"epoch:{epoch} loss:{loss.tolist()}")
    # 4. Update the parameters, then clear gradients for the next epoch.
    optimizer.step()
    optimizer.zero_grad()

epoch:0 loss:0.5274774432182312
epoch:10000 loss:0.11526338756084442

visualization

# Reverse lookup: integer label -> class letter.
class_map_inv = {label: letter for letter, label in class_map.items()}
class_map_inv

{0: 'A', 1: 'B', 2: 'C'}

# Encode the training tensor and place the three code columns (renamed x, y, z)
# next to the integer class labels.
_codes = np.array(autoencoder_3.encoder(X_train_ohe).tolist())
dt_dim3 = pd.concat(
    [pd.DataFrame(_codes), pd.DataFrame({"class": Y_train_ohe})],
    axis=1,
).rename(columns={0: "x", 1: "y", 2: "z"})

# One 3-D scatter trace per class. Colours follow the order in which classes
# first appear in the data: red, then blue, then black for any further class.
_palette = ("red", "blue")
data = []
for _rank, cl in enumerate(dt_dim3["class"].unique()):
    _points = dt_dim3.loc[dt_dim3["class"] == cl, :]
    _colour = _palette[_rank] if _rank < len(_palette) else "black"
    data.append(
        go.Scatter3d(
            x=_points.x.tolist(),
            y=_points.y.tolist(),
            z=_points.z.tolist(),
            mode="markers",
            marker=dict(color=_colour, size=2),
            name=str(class_map_inv[cl]),
        )
    )

layout = go.Layout(title=dict(text="3-dimension "))

# Assemble the figure and display it.
fig = go.Figure(data=data, layout=layout)
fig.show()

encoding dimension=10

# Fixed seed so that parameter initialisation is reproducible.
torch.manual_seed(201711375)
# Take the input width from the feature tensor instead of hard-coding it.
# NOTE(review): this 10-dimensional model rebinds the name `autoencoder_3`,
# replacing the 3-dimensional model trained earlier; the name is kept so
# anything that refers to it afterwards still works.
autoencoder_3 = AutoEncoder(X_train_ohe.shape[1], 10)
loss_fn = torch.nn.MSELoss()
relu = torch.nn.LeakyReLU()  # NOTE(review): not used by the loop below, and it is a LeakyReLU despite the name
optimizer = torch.optim.Adam(autoencoder_3.parameters(), lr=0.001)

# Each epoch is one optimisation step on the whole training tensor.
for epoch in range(40000):
    # 1. Reconstruct the inputs.
    out = autoencoder_3(X_train_ohe)
    # 2. Reconstruction error against the inputs themselves.
    loss = loss_fn(out, X_train_ohe)
    # 3. Back-propagate.
    loss.backward()
    if epoch % 10000 == 0:
        print(f"epoch:{epoch} loss:{loss.tolist()}")
    # 4. Update the parameters, then clear gradients for the next epoch.
    optimizer.step()
    optimizer.zero_grad()

epoch:0 loss:0.3828417658805847
epoch:10000 loss:0.060432590544223785
epoch:20000 loss:0.06043253839015961
epoch:30000 loss:0.060432031750679016

참고링크

링크1
링크2