강화학습을 이용한 비트코인 매매프로그램(6)-Buroto Force학습

아빠는 벌레잡이 2022. 12. 4. 23:34

2022. 12. 4. 23:34

import random
from collections import deque, namedtuple
from typing import Dict, List, NamedTuple, Optional, Text, Tuple, Union
# from IPython.display import display, Math
from Account import Account
import math
from itertools import count
import os

import sys
import os.path as path
import numpy as np
import plotly as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as T
from Market import Market
from PIL import Image
from plotly import express as px
import torchvision
import time
import joblib
import pandas as pd
from dqn import DQN
from memory import ReplayMemory, Experience

action_kind = 21
max_episode = 5000
screen_height = 100
screen_width  = 140
data_size = 600
epsilon = 0.3
dis = 0.9

BATCH_SIZE = 8
# GAMMA = 0.999
GAMMA = 0.99
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10
steps_done = 0
loss = any
WINDOW_START = 0
WINDOW_SIZE  = 1500

# for i in range(torch.cuda.device_count()):
#     print(torch.cuda.get_device_name(i))

# if gpu is to be used
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cuda")
# device_str = "cpu"
device_str = "cuda"
device = torch.device(device_str)
    
memory = ReplayMemory(BATCH_SIZE)
converter = torchvision.transforms.ToTensor()
market = Market()
train_net = DQN(device, screen_height, screen_width, action_kind, 32).to(device)
train_net = nn.DataParallel(train_net, device_ids=[0,1]).to(device)

episode_durations = []
    
optimizer = optim.RMSprop(train_net.parameters(), lr=0.000001)

# def select_action(df, idx):
#     try:
#         global steps_done
#         sample = random.random()
#         eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
#         steps_done += 1
#         clslvl = ((df.loc[idx, "close"] - df.loc[idx, "closemin"]) * 40 // (df.loc[idx, "closemax"] - df.loc[idx, "closemin"]))
#         if sample > eps_threshold:
#             if df.loc[idx, "closemax"] == df.loc[idx, "close"]:
#                 action = clslvl
#                 return  action
#             elif df.loc[idx, "closemin"] == df.loc[idx, "close"]:
#                 action = clslvl
#                 return  action
#             else:
#                 return clslvl
#         else:
#             action = random.randrange(action_kind)
#             return  action
#     except Exception as ex:
#         exc_type, exc_obj, exc_tb = sys.exc_info()
#         fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
#         print("`select_action -> exception! %s : %s %d" % (str(ex) , fname, exc_tb.tb_lineno))
#         return 0

def select_action(df, idx):
    return ((df.loc[idx, "close"] - df.loc[idx, "closemin"]) * (action_kind-1) // (df.loc[idx, "closemax"] - df.loc[idx, "closemin"]))
        
def get_chart(market, idx, max_data):
    img = market.get_chart(idx, max_data=max_data)
    if img is None:
        return None
    # if  idx > (df.index.max() - data_size - 1):
    #     return None
    # img = Image.fromarray(np.uint8(cm.gist_earth(plt.io.to_image(fig, format='png')*255)))
    # im = Image.fromarray(img, bytes=True)
    # im = Image.fromarray(np.uint8(cm.gist_earth(img))/255)
    # im = Image.fromarray(np.uint8(img)/255)
    # img = img.resize((700, 500), resample=Image.BICUBIC)
    # display(img)
    # img = Image.fromarray(cm.gist_earth(plt.io.to_image(fig, format='png'), bytes=True))
    chart = converter(img).unsqueeze(0)
    return chart

def plot_durations(last_chart, curr_chart):
    plt.figure()
    # plt.subplot(1,2,1)
    img = plt.imshow(last_chart.cpu().squeeze(0).permute(1, 2, 0).numpy(), interpolation='none')
    plt.title('Example extracted screen')
    plt.figure(2)
    # plt.subplot(1,2,2)
    plt.clf()
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # plt.show()
    # Take 100 episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

    img.set_data(curr_chart.cpu().squeeze(0).permute(1, 2, 0).numpy())
    plt.pause(0.01)  # pause a bit so that plots are updated
    # display.clear_output(wait=True)
    # display.display(plt.gcf())
            
def main():
    if path.exists("pt/train_net_{}.pt".format(device_str)):
        train_net.load_state_dict(torch.load("pt/train_net_{}.pt".format(device_str)))
        
    for _ in range(10):    
        df = market.get_data()
        # df = df.head(WINDOW_SIZE)

        for epoch in range(max_episode):
            account = Account(df, 50000000)
            account.reset()

            last_chart = get_chart(market, data_size, data_size)
            account.reset()

            for idx,_ in enumerate(df.index, start=(data_size + 1)):
                try:
                    since = time.time()
                    curr_chart = get_chart(market, idx, data_size)
                    if curr_chart is None or last_chart is None:
                        continue
                    else:
                        state = curr_chart - last_chart
                        
                    last_chart = curr_chart
                    reward = 0
                    num_action = select_action(df, idx)
                    reward, real_action = account.exec_action(2 if num_action == (action_kind - 1) else (1 if num_action == 0 else 0) , idx)
                    print("idx:%d==>action:%d,real_action==>%d price:%.2f"%(idx, num_action, real_action, df.loc[idx, 'close']))
                    # reward = torch.tensor([reward], device=device)
                    num_action = torch.tensor([[num_action]], device=device, dtype=torch.int64)
                                    
                    memory.push(curr_chart, num_action, reward)
                    if len(memory) >= BATCH_SIZE:
                        epsode = memory.pop(BATCH_SIZE)
                        batch = Experience(*zip(*epsode))
                        # non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                        #                                       batch.next_state)), device=device, dtype=torch.bool)
                        # non_final_next_states = torch.cat([s for s in batch.next_state
                        #                                             if s is not None])
                        state_batch = torch.cat(batch.state)
                        action_batch = torch.cat(batch.action)
                        # reward_batch = torch.cat(batch.reward)
                        train_net.train()
                        state_action_values = train_net(state_batch).gather(1, action_batch)
                        # next_state_values = torch.zeros(BATCH_SIZE, device=device)
                        # next_state_values[non_final_mask] = train_net(non_final_next_states).max(1)[0].detach()
                        # expected_state_action_values = (next_state_values * GAMMA) + reward_batch
                        # print(action_batch)
                        # print(state_action_values)
                        optimizer.zero_grad()
                        criterion = nn.SmoothL1Loss()
                        loss = criterion(state_action_values, action_batch)
                        # print(loss)
                        # loss = criterion(state_action_values, action_batch.unsqueeze(1))
                        # loss = criterion(market_status, train_status)

                        # Optimize the model
                        loss.backward()
                        for param in train_net.parameters():
                            param.grad.data.clamp_(-1, 1)
                        optimizer.step()
                        print("epoch[%d:%d] epsode is next loss[%.10f]" % (epoch, idx, loss.item()))

                    if idx % TARGET_UPDATE == 0:
                        torch.save(train_net.state_dict(),"pt/train_net_{}.pt".format(device_str))
                    
                    spend = time.time() - since
                    print("idx:%d price [%.4f] unit[%.4f] used time[%.2f] agent rate:%.05f remind money:%.02f" 
                            % (idx, df.loc[idx, 'close'], account.unit, spend, account.rate, account.balance + account.unit * df.loc[idx, 'close']))
                    if account.is_bankrupt():
                        break
                    if idx == df.index.max():
                        break
                        # train_net.load_state_dict(train_net.state_dict())
                    # save_file_model(degreeQ)                
                except Exception as ex:
                    exc_type, exc_obj, exc_tb = sys.exc_info()
                    fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                    print("`recall_training -> exception! %s : %s %d" % (str(ex) , fname, exc_tb.tb_lineno))

        print("end training DQN")
    
    print('Complete Training')

if __name__ == "__main__":
    main()

main_buroto.py 파일입니다.

import random
from collections import deque, namedtuple

Experience = namedtuple(
    'Experience',
    ('state', 'action', 'reward')
)

class ReplayMemory(object):
    def __init__(self, capacity):
        self.memory = deque([],maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Experience(*args))

    def pop(self, batch_size):
        arr = []
        for _ in range(batch_size):
            arr.append(self.memory.popleft())
        return arr
    
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

memory.py 부분을 분리했습니다.

import torch.nn as nn
import torch.nn.functional as F

class DQN(nn.Module):
    def __init__(self, device, h, w, outputs, qsize):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(4, h*w, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(h*w)
        self.conv2 = nn.Conv2d(h*w, qsize, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(qsize)
        self.device = device
        self.conv3 = nn.Conv2d(qsize, qsize, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(qsize)

        # Number of Linear input connections depends on output of conv2d layers
        # and therefore the input image size, so compute it.
        def conv2d_size_out(size, kernel_size = 5, stride = 2):
            return (size - (kernel_size - 1) - 1) // stride  + 1
        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        # print("convw[%d]  convh[%d]" % (convw, convh))
        linear_input_size = 54 * qsize
        self.head = nn.Linear(linear_input_size, outputs)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = x.to(self.device)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1))

dqn.py를 분리했습니다.

main함수부터 설명 드리면 가장 이상적인 매매 프로세스인 저점에서 매수하고 고점에서 매도하는 방식을 rolling함수를 이용하여 구하였습니다.
그리고 이것을 전체 데이터에 적용을 하여 history에 저장합니다.
매매 길이에 따라서 수천프로이상의 수익이 발생할 것입니다.
이것은 말 그대로 이상적인 매매 프로세스 입니다.
백프로 매치되는 것은 아니지만 이상적인 grid world의 마르코프 실행 프로세서(MDP) 처럼은 우리도 이것을 MDP로 가정합니다.
이제 실행된 결과를 DQN(CNN)에 학습합니다.
optimize함수가 그 역할을 합니다.
loss함수는 nn.SmoothL1Loss를 사용하여 해당 S의 action a와 기대 이상 행동 a'를 구하여 둘의 차이를 구하여 계산합니다.
아쉽지만 windows에서는 NCCL이 적용되지 않아 pytorch함수에서 에러가 발생합니다.
NCCL은 nvidia의 GPU를 서로 연동하여 최상의 성능을 내는 Api lib입니다.
현재 ubuntu와 red hat에서만 지원하고 있습니다.
그래서 지금 저는 WSL2를 이용한 ubuntu LTS(최종안정화버젼)인 20.04.5버젼으로 이전을 하려고 하는데 windows10과 wsl2의 병행 운영은 엄청난 cpu 제원을 요구 하네요.
어쩔 수 없이 서버를 ubuntu로 다시 설치하고 노트북으로 접속하여 이용하는 방법을 실행하려고 하고 있습니다.
불행히도 제 노트북이 현재 아작이 난 상태라 알리에서 검색해서 LCD판넬을 주문한 상태입니다만 가장 빠른 날짜가 12월 15일 도착이라고 하네요.
Nvidia 홈페이지 상으로는 현재로써는 windows에 NCCL이 적용이 안되어서 여러장의 GPU카드가 있어도 한장의 효과만 있다는 거죠.
현재 소스에서 보시면 nn.DataParallel이 DQN을 감싸고 있습니다.
실제 성능탭으로 보면 그래도 nn.DataParallel을 적용하기 전에는 한개의 GPU만 사용하게 작동이 되었으나 지금은 어느 정도 반반씩 부담하여 사용하는게 보입니다.
중요한 점은

recall_history()함수에서 저장된 데이터를 recall_training 함수에서 DQN을 학습하게 됩니다. 학습된 결과는 pt/training_net.pt파일로 저장을 하게 됩니다. 저의 경우는 데이터가 너무 많은 관계로 과적합 상태를 보이나 로스차이가 그렇게 크지는 않습니다. 적당한 데이터의 크기는 5000~10000개 정도의 데이터를 반복 학습하면서 loss를 최소화 한다음 다시 데이터를 샘플링하고 적용하고 하는 반복 작업을 통하여 실제 DQN이 적은 loss로 동작하는지가 중요합니다.

이 작업이 어느 정도 끝나면 다음은 main_agent.py 파일을 작성하여 실제 상황에서 detection을 발생하여 거래를 하게 하여 어느 정도의 수익률이 나오는지 분석을 해 보도록 하겠습니다.
저도 처음에는 강화학습이 데이터가 많이 적용되면 더욱 딕텍션이 안정될 거라 생각했지만 실제로는 데이터가 많을 수록 ai는 거래를 하지 않으려하네요.
프로그램을 조금 수정해서 pt파일을 여러 버젼으로 저장하는게 어떨까 제안 합니다.
다음 차에서 agent파일을 돌려 보면 알겠지만 DQN은 가장 안정화된 방법을 선택하려 합니다.
즉 매매를 하지 않아 수익도 손실도 없는 상태를 만들려고 합니다.
그렇지만 어떤 경우 훈련이 잘 진행되어 100%이상의 수익을 내기도 합니다.
아무런 if문 없이 수익을 낸다는 게 신기합니다만 retro.gym을 경험해 보신분들은 아실 겁니다.
사실 retro.gym을 훈련하는 과정에도 인간의 생각 판단 따위는 중요하지 않습니다.
훈련은 우연의 연속입니다.
다만 그 우연들 중에서 가장 스코어가 높은 쪽으로만 AI는 움직이려 합니다.
그렇게 하는것은 인간의 계산에서 epsilon값을 어떻게 판단하는냐에 따라 결과가 달라 질 겁니다.
epsilon을 0.5 정도로 하면 절반은 모험값으로 채워질 겁니다.
그리고 결과를 수익률 또는 남은 돈으로 정할 때 그 값이 최고가 되는 모델들만 history에 저장하고 학습을 돌린다면 엔진은 공격적인 매매를 해서 높은 수익률을 올릴 것입니다.
다음 편 실 agent 시뮬레이션 편을 기대 해 주세요.

LIST

저작자표시 비영리 (새창열림)

'python > 자동매매 프로그램' 카테고리의 다른 글

강화학습을 이용한 비트코인 매매프로그램(9) - CNN+RNN 모델 (0)	2022.12.20
강화학습을 이용한 비트코인 매매프로그램(8) - wsl이용하기 (3)	2022.12.12
강화학습을 이용한 비트코인 매매프로그램(5)-데이터에 Min/Max 추가하기 (0)	2022.11.30
강화학습을 이용한 비트코인 매매프로그램(4)-LSTM의 주가분석 문제점 (1)	2022.11.30
강화학습을 이용한 비트코인 매매프로그램(3) - 데이터로 차트 그리기 (1)	2022.11.28

오늘도 아빠는 벌레잡는 중

강화학습을 이용한 비트코인 매매프로그램(6)-Buroto Force학습

'python > 자동매매 프로그램' 카테고리의 다른 글

+ Recent posts

티스토리툴바