Import
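The import cell is not reproduced here, so the sketch below shows what it presumably contains: the libraries that simple_bandit uses, plus the argmax and bandit helpers it calls but that are not shown in this post. The two helpers are assumptions reconstructed from the comments in the code (argmax returns the action with the largest estimated value, bandit samples a reward from a unit-variance Gaussian centered on the true action value).

import random
import numpy as np
from scipy.stats import bernoulli, norm

def argmax(Q):
    # assumed helper: return the action (dict key) with the largest estimated value
    return max(Q, key=Q.get)

def bandit(q_star):
    # assumed helper: sample a reward from a Gaussian with mean q_star and variance 1
    return norm.rvs(loc=q_star, scale=1)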
Implementation
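This cell follows the epsilon-greedy "simple bandit" algorithm from Chapter 2 of Sutton & Barto's Reinforcement Learning: An Introduction: with probability 1 - epsilon take the greedy action argmax_a Q(a), otherwise pick an action uniformly at random, observe a reward R, and update the sample-average estimate of the chosen action incrementally,

Q_{n+1} = Q_n + (1/n) * (R_n - Q_n)

which is the incremental Q-update line below. Instead of running for a fixed number of steps, the loop terminates once no true mean exceeds its estimate by more than terminate_cond.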
def simple_bandit(num_actions,    # number of actions (k)
                  epsilon,        # probability of taking a non-greedy (random) action
                  q_stars,        # true mean of each action's (Gaussian) reward distribution
                  terminate_cond  # termination threshold on the gap between true means and estimates
                  ):
    if num_actions != len(q_stars):
        print("num_actions must equal the number of bandit means (q_stars)")
        return
    Q = {}
    n = {}
    # Initialize, for a = 1 to k
    for i in range(1, num_actions+1):
        Q[i] = 0
        n[i] = 0

    # probability of acting greedily
    max_prob = 1 - epsilon
    # epsilon-greedy action loop
    while True:
        # 1. greedy action or random action?
        greedy_or_random = bernoulli.rvs(p=max_prob, size=1)
        # 2. select action
        if greedy_or_random == 1:
            action = argmax(Q)
        else:
            action = random.sample(list(Q.keys()), 1)[0]
        # sample reward from a Gaussian (mean = q_star, variance = 1)
        q_star = q_stars[action-1]
        reward = bandit(q_star)  # sampling
        # incremental Q-update
        n[action] += 1
        Q[action] = Q[action] + (1/n[action]) * (reward - Q[action])
        # terminate once no true mean exceeds its estimate by more than terminate_cond
        if np.max(q_stars - np.array(list(Q.values()))) <= terminate_cond:
            break
    return n, Q

num_actions = 10
q_stars = norm.rvs(loc=0, scale=1, size=num_actions)
num_action, Q_estimated = simple_bandit(num_actions=num_actions, epsilon=0.1, q_stars=q_stars, terminate_cond=0.01)
for i in range(1, num_actions+1):
    print("action :", i)
    print("taken_num", num_action[i])
    print("q_star :", q_stars[i-1])
    print("Q_estimated", Q_estimated[i])
    print("=============================================")C:\Users\22668\AppData\Local\Temp\ipykernel_5296\2459308436.py:30: DeprecationWarning: Sampling from a set deprecated
since Python 3.9 and will be removed in a subsequent version.
  action = random.sample(Q.keys(),1)[0]
action : 1
taken_num 7710
q_star : 1.0622402967553395
Q_estimated 1.0780006731009761
=============================================
action : 2
taken_num 7750
q_star : -0.04393405775378968
Q_estimated -0.0464016778867334
=============================================
action : 3
taken_num 7688
q_star : -0.2977471187595547
Q_estimated -0.28941229886239545
=============================================
action : 4
taken_num 7766
q_star : -0.6942140347822109
Q_estimated -0.6957038264269383
=============================================
action : 5
taken_num 702767
q_star : 1.114864011478195
Q_estimated 1.1163009140188438
=============================================
action : 6
taken_num 7695
q_star : -2.0791842661153535
Q_estimated -2.0891168451782134
=============================================
action : 7
taken_num 7704
q_star : -1.204190113360393
Q_estimated -1.2086679112146015
=============================================
action : 8
taken_num 7754
q_star : 0.9093721477182354
Q_estimated 0.908951189305117
=============================================
action : 9
taken_num 7658
q_star : 0.0790709385366291
Q_estimated 0.08704136534792037
=============================================
action : 10
taken_num 7736
q_star : -0.06424396205536466
Q_estimated -0.05355515687875377
=============================================