13 Example 2

  • In the previous example, we simply assumed that the agent received a reward of magnitude 1 on the first 1/4 and 3/4 of all trials.

  • Therefore, whether or not the agent received a reward was independent of what the agent actually did.

  • This means that the previous example was an instance of classical conditioning – think Pavlov’s dog.

  • Here, we will simulate a scenario where whether or not the agent receives a reward depends on the action that it takes – i.e., we will examine an instance of instrumental conditioning.

  • We will only need to update the simulate_network function by specifying how activity in the network should be mapped to actions.

  • Our approach will be to assume that the network emits a response – e.g., presses a lever like a rat in an instrumental conditioning chamber – completely at random. A small standalone sketch of this action-to-reward contingency follows, before the full simulation code.
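  • The sketch below is separate from the network simulation: on each trial a lever press is sampled at random, and a reward is delivered only if the press occurred. The press probability of 0.25 matches the value used in simulate_network_inst further down; the names sample_trial, p_press, and rng are illustrative only and are not part of the simulation code.

import numpy as np

rng = np.random.default_rng(0)

def sample_trial(p_press=0.25):
    # the agent's "policy": press the lever completely at random
    pressed = rng.uniform(0, 1) < p_press
    # instrumental contingency: reward is delivered only if the press occurred
    reward = 1 if pressed else 0
    return pressed, reward

# tally rewards over 100 independent trials
outcomes = [sample_trial() for _ in range(100)]
n_rewarded = sum(reward for _, reward in outcomes)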

def simulate_network_inst(update_weight_func):
    # identical to simulate_network from the previous example, except that the
    # reward is now contingent on a (randomly emitted) lever press rather than
    # being delivered on a fixed schedule of trials
    global trl, r_obtained, r_predicted

    for j in range(n_trials - 1):
        trl = j

        for i in range(1, n):

            dt = t[i] - t[i - 1]

            # external input
            dgdt = (-g[i - 1] + psp_amp * spike[i - 1]) / psp_decay
            g[i] = g[i - 1] + dgdt * dt

            # neuron 1
            dvdt1 = (k * (v1[i - 1] - vr) *
                     (v1[i - 1] - vt) - u1[i - 1] + w_01[trl] * g[i - 1]) / C
            dudt1 = a * (b * (v1[i - 1] - vr) - u1[i - 1])
            dgdt1 = (-g1[i - 1] + psp_amp * spike1[i - 1]) / psp_decay
            v1[i] = v1[i - 1] + dvdt1 * dt
            u1[i] = u1[i - 1] + dudt1 * dt
            g1[i] = g1[i - 1] + dgdt1 * dt
            if v1[i] >= vpeak:
                v1[i - 1] = vpeak
                v1[i] = c
                u1[i] = u1[i] + d
                spike1[i] = 1

            # neuron 2
            dvdt2 = (k * (v2[i - 1] - vr) *
                     (v2[i - 1] - vt) - u2[i - 1] + w_12[trl] * g1[i - 1]) / C
            dudt2 = a * (b * (v2[i - 1] - vr) - u2[i - 1])
            dgdt2 = (-g2[i - 1] + psp_amp * spike2[i - 1]) / psp_decay
            v2[i] = v2[i - 1] + dvdt2 * dt
            u2[i] = u2[i - 1] + dudt2 * dt
            g2[i] = g2[i - 1] + dgdt2 * dt
            if v2[i] >= vpeak:
                v2[i - 1] = vpeak
                v2[i] = c
                u2[i] = u2[i] + d
                spike2[i] = 1
                
        # the network presses the lever, and thereby earns a reward, on a random 25% of trials
        if np.random.uniform(0, 1) < 0.25:
            r_obtained[trl] = 1

        # update synaptic weights
        delta_w = update_weight_func()
        w_12[trl + 1] = w_12[trl] + delta_w

        # store trial info
        g_record[trl, :] = g
        v1_record[trl, :] = v1
        g1_record[trl, :] = g1
        v2_record[trl, :] = v2
        g2_record[trl, :] = g2
        
    plot_results()
    
        
n_trials = 100
trl = 0

# time: integration step tau and single-trial duration T
tau = 0.1
T = 100
t = np.arange(0, T, tau)
n = t.shape[0]

# Izhikevich-style neuron parameters
C = 50
vr = -80
vt = -25
vpeak = 40
k = 1
a = 0.01
b = -20
c = -55
d = 150

# postsynaptic potential amplitude and decay
psp_amp = 1e5
psp_decay = 10

# external input: a fixed spike train delivered on every trial
g = np.zeros(n)
spike = np.zeros(n)
spike[200:800:20] = 1

# learning-rule parameters used by the weight-update functions
alpha = 3e-14
beta = 3e-14
gamma = 0.1

array_dict = init_arrays()

r_predicted = array_dict['r_predicted']
# NOTE: redefine r_obtained to be all zeros, so that the
# network simulation can populate it on the fly
# r_obtained = array_dict['r_obtained']
r_obtained = np.zeros(n_trials)
delta = array_dict['delta']
v1 = array_dict['v1']
u1 = array_dict['u1']
g1 = array_dict['g1']
spike1 = array_dict['spike1']
v2 = array_dict['v2']
u2 = array_dict['u2']
g2 = array_dict['g2']
spike2 = array_dict['spike2']
w_01 = array_dict['w_01']
w_12 = array_dict['w_12']
g_record = array_dict['g_record']
v1_record = array_dict['v1_record']
g1_record = array_dict['g1_record']
v2_record = array_dict['v2_record']
g2_record = array_dict['g2_record']

# run the instrumental conditioning simulation using the reward-driven
# weight-update rule from the previous example
update_weight_func = update_weight_rl
simulate_network_inst(update_weight_func)
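
  • As a quick sanity check after the run, we can verify that roughly 25% of trials were rewarded and see how the learned weight w_12 changed over the course of learning. A small snippet, assuming the arrays above are still in scope:

print('proportion of rewarded trials:', r_obtained.mean())
print('w_12 on the first vs. last trial:', w_12[0], w_12[-1])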