Python Language Companion to Efficiency and Productivity Analysis: Using Copulas in Stochastic Frontier Models - 5 Chapter 5: Modelling Endogeneity in Stochastic Frontier Models

5.1 Exercise 5.1: C2SLS of Spanish diary farm production model.

This exercise implements the C2SLS estimator for a sample of 137 milk farms in Northern Spain during 1999-2010.

import pandas as pd
import numpy as np 
import statsmodels.api as sm

# load data
dairyfarms = pd.read_excel('spanishdairy.xls', usecols=['DEPREC', 'FARMEXP', 'MILK', 
                                                        'LABOR', 'COWS', 'FEED', 'LAND', 
                                                        'YEAR', 'COOP', 'PFEED', 'PMILK', 'LANDOWN'])
dairyfarms = dairyfarms.astype(float)

# Drop farms with zero roughage
data = dairyfarms[(dairyfarms['DEPREC'] != 0) & (dairyfarms['FARMEXP'] != 0)].copy()
data.reset_index(inplace=True)
data['ROUGHAGE'] = data['FARMEXP'] + data['DEPREC']

# scaling data
y = np.log(data['MILK']) - np.log(np.mean(data['MILK']))
X = data[['LABOR', 'COWS', 'FEED', 'LAND', 'ROUGHAGE']]
X = np.log(X) - np.log(np.mean(X))
X = pd.concat([pd.DataFrame(data={'CONST': np.ones((len(X)))}), X], axis=1)

# create indicator matrix D
D = pd.get_dummies(data['YEAR'], prefix='YEAR', drop_first=True, dtype=int)

# create Z matrix
data['COOP'] = data['COOP'].astype(int)
COOP_dummy = pd.get_dummies(data['COOP'], prefix='COOP', drop_first=False, dtype=int)
COOP_dummy = COOP_dummy[['COOP_303', 'COOP_304', 'COOP_306', 'COOP_313', 'COOP_307', 'COOP_308', 'COOP_310']]

Z = pd.concat([pd.DataFrame(data={'CONST': np.ones((len(X)))}), 
               X[['LAND']], data[['PMILK', 'PFEED', 'LANDOWN']], 
               COOP_dummy], axis=1)

Z = pd.concat([Z, D], axis=1)

# compute regression coefficients and standard errors
OLS1 = sm.OLS(X[['LABOR']].values, Z.values)
results1 = OLS1.fit()
p1 = results1.params
se1 = np.sqrt(np.diag(results1.cov_params()))
x1h = Z@p1

OLS2 = sm.OLS(X[['COWS']].values, Z.values)
results2 = OLS2.fit()
p2 = results2.params
se2 = np.sqrt(np.diag(results2.cov_params()))
x2h = Z@p2

OLS3 = sm.OLS(X[['FEED']].values, Z.values)
results3 = OLS3.fit()
p3 = results3.params
se3 = np.sqrt(np.diag(results3.cov_params()))
x3h = Z@p3

OLS4 = sm.OLS(X[['LAND']].values, Z.values)
results4 = OLS4.fit()
p4 = results4.params
se4 = np.sqrt(np.diag(results4.cov_params()))
x4h = Z@p4

OLS5 = sm.OLS(X[['ROUGHAGE']].values, Z.values)
results5 = OLS5.fit()
p5 = results5.params
se5 = np.sqrt(np.diag(results5.cov_params()))
x5h = Z@p5

Xh = pd.concat([X['CONST'], x1h, x2h, x3h, x4h, x5h, D], axis = 1)
OLS_Xh = sm.OLS(y.values, Xh.values)
results_Xh = OLS_Xh.fit()
ls2 = results_Xh.params
se2s = np.sqrt(np.diag(results_Xh.cov_params()))
uh2s = y.values - (pd.concat([X,D], axis=1)@ls2).values

# under homoskedastity
v1 = 1/(len(y)-Xh.shape[1])*np.sum(uh2s**2)*np.linalg.inv((Xh.T@Xh))
# under heteroskedasticity
vv = np.zeros((Xh.shape[1], Xh.shape[1]))
for i in range(len(y)):
    uh2si = uh2s[i]**2
    vv += uh2si*Xh.values[i,:].reshape(-1,1)@Xh.values[i,:].reshape(-1,1).T
v2 = len(y)/(len(y)-Xh.shape[1])*np.linalg.inv(Xh.values.T@Xh.values)@vv@np.linalg.inv(Xh.values.T@Xh.values)
correct_se2s_NCH = np.sqrt(np.diag(v1))
correct_se2s_noNCH = np.sqrt(np.diag(v2))

mu3h2SLS = np.mean(uh2s**3)
sige2h2SLS = np.mean(uh2s**2)
sigu2h2SLS = (np.pi/(np.pi - 4)*np.sqrt(np.pi/2)*mu3h2SLS)**(2/3)*(mu3h2SLS<0)
alphah2SLS = ls2[0] + np.sqrt(2/np.pi*sigu2h2SLS)
sigv2h2SLS = sige2h2SLS-(np.pi-2)/np.pi*sigu2h2SLS
siguh2SLS = np.sqrt(sigu2h2SLS)
sigvh2SLS = np.sqrt(sigv2h2SLS)
lamh2SLS = siguh2SLS/sigvh2SLS
sigh2SLS = np.sqrt(sigu2h2SLS+sigv2h2SLS)

C2SLS_beta0 = np.concatenate(([alphah2SLS], ls2[1:]))
C2SLS_sig0 = sigh2SLS
C2SLS_lam0 = lamh2SLS
C2SLS_sigu = sigu2h2SLS
C2SLS_sigv = sigv2h2SLS
C2SLS_ster_beta0 = correct_se2s_NCH
C2SLS_Eu = np.sqrt((2 / np.pi) * sigu2h2SLS)

#Display the results as a table 
results = pd.DataFrame(data = {'Est': np.concatenate([C2SLS_beta0, np.array([C2SLS_sig0, C2SLS_lam0, C2SLS_sigu, C2SLS_sigv, C2SLS_Eu])], axis = 0), 
                               'St.Err':np.concatenate([C2SLS_ster_beta0, np.array([np.nan, np.nan, np.nan, np.nan, np.nan])], axis = 0)},
                       index = ["Const", "Labor", "Cows", "Feed", 
                                "Land", "Rough", "D00", "D01", "D02", 
                                "D03", "D04", "D05", "D06", "D07", 
                                "D08", "D09", "D10", "Sigma", 
                                "Lambda", "Sigma2_u", "Sigma2_v", "E(u)"])
results = results.round(4)
display(results)

/Users/jleu0010/Documents/GitHub/EPA-Copula-SFA-website/venv/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3462: FutureWarning: In a future version, DataFrame.mean(axis=None) will return a scalar mean over the entire DataFrame. To retain the old behavior, use 'frame.mean(axis=0)' or just 'frame.mean()'
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)

	Est	St.Err
Const	0.0679	0.0255
Labor	0.1246	0.0628
Cows	0.9704	0.0827
Feed	0.0889	0.0472
Land	-0.1136	0.0177
Rough	0.1108	0.0260
D00	0.0093	0.0202
D01	0.0256	0.0221
D02	0.0406	0.0244
D03	0.0163	0.0250
D04	0.0430	0.0256
D05	0.1005	0.0371
D06	0.1320	0.0401
D07	0.1294	0.0412
D08	0.1135	0.0403
D09	0.1031	0.0419
D10	0.1194	0.0405
Sigma	0.2167	NaN
Lambda	1.7735	NaN
Sigma2_u	0.0356	NaN
Sigma2_v	0.0113	NaN
E(u)	0.1506	NaN

# Remove all previous function and variable definitions before the next exercise
%reset

5.2 Exercise 5.2 GMM of Spanish diary farm production model

This exercise implements the GMM estimator for a sample of 137 milk farms in Northern Spain during 1999-2010.

import numpy as np 
import pandas as pd
import statsmodels.api as sm

from scipy import stats
from scipy.optimize import minimize

def AppGMM(coefs, y, X, Z):

    nX = X.shape[1]
    beta = coefs[:nX]
    sig = coefs[-2]
    lam = coefs[-1]  
           
    er = y - X@beta
    m1 = -1 +1/sig**2*er**2;
    
    pdf = stats.norm.pdf(lam/sig*er)
    cdf = stats.norm.cdf(lam/sig*er)
    m2 = er*pdf/(1-cdf)
    
    m3 = np.zeros((len(y), Z.shape[1]))
    for i in range(Z.shape[1]):
        m3[:, i] = 1/sig*Z[:, i]*er + lam*Z[:, i]*pdf/(1-cdf)
    
    mbar = np.concatenate([np.array([np.mean(m1)]), 
                           np.array([np.mean(m2)]), 
                           np.mean(m3, axis=0)])
    mom = np.concatenate([m1.reshape(-1,1), m2.reshape(-1,1), m3], axis=1)
    
    C = mom.T@mom/len(y)
    W = np.linalg.inv(C)
    Q = mbar@W@mbar.T
    
    return Q

def m(coefs, y, X, Z):
    
    nX = X.shape[1]
    beta = coefs[:nX]
    sig = coefs[-2]
    lam = coefs[-1]  
    
    er = y - X@beta
    m1 = -1 +1/sig**2*er**2;
     
    pdf = stats.norm.pdf(lam/sig*er)
    cdf = stats.norm.cdf(lam/sig*er)
    m2 = er*pdf/(1-cdf)
     
    m3 = np.zeros((len(y), Z.shape[1]))
    for i in range(Z.shape[1]):
        m3[:, i] = 1/sig*Z[:, i]*er + lam*Z[:, i]*pdf/(1-cdf)
    
    m = np.concatenate([np.array([np.mean(m1)]), 
                           np.array([np.mean(m2)]), 
                           np.mean(m3, axis=0)])
    
    return m
              
def AppEstimate_GMM(y, X, Xh, Z):
    
    beta0 = [0.0368, 0.1169, 0.9685, 0.1156, -0.1047, 
             0.0911, 0.0096, 0.0244, 0.0354, 
             0.0132, 0.0423, 0.0933, 0.1305, 0.1302, 
             0.1151, 0.1025, 0.1222]
    sig0 = 0.21
    lam0 = 1.81

    theta0 = np.array(beta0 + [sig0, lam0])
    
    bounds = [(None, None) for x in range(len(theta0) - 2)] + [
        (1e-6, np.inf),
        (1e-6, np.inf),
    ]

    # Minimize the negative log-likelihood using numerical optimization.
        # nelder-mead method obtains a solution closer to the MATLAB solution
        # L-BFGS-B method obtaines a better solution (smaller values of the objective). However, this solution is different to te MATLAB solution
    MLE_results = minimize(
        fun=AppGMM,
        x0=theta0,
        method="nelder-mead",
        tol = 1e-6,
        options={"maxiter": 1000},
        args=(y.values, X.values, Z.values),
        bounds=bounds,
    )
    theta = MLE_results.x
    
    nX = Xh.shape[1]
    beta = theta[:nX]
    sig = theta[-2]
    lam = theta[-1]   
    
    er = y.values - X.values@beta
    m1 = -1 +1/sig**2*er**2;
    
    pdf = stats.norm.pdf(lam/sig*er)
    cdf = stats.norm.cdf(lam/sig*er);
    m2 = er*pdf/(1-cdf)
    
    m3 = np.zeros((len(y), Z.shape[1]))
    for i in range(Z.shape[1]):
        m3[:, i] = 1/sig*Z.values[:, i]*er + lam*Z.values[:, i]*pdf/(1-cdf)
    
    mom = np.concatenate([m1.reshape(-1,1), m2.reshape(-1,1), m3], axis=1)
    C = mom.T@mom/len(y)
    W = np.linalg.inv(C)
        
    # standard errors
    delta = 1e-1
    grad = np.zeros((25, len(theta)))                                                  
    for i in range(len(theta)):
        theta1 = np.copy(theta)
        theta1[i] += delta
        grad[:,i] = (m(theta1, y.values, X.values, Z.values) - m(theta,  y.values, X.values, Z.values))/delta
    OPG = grad.T@W@grad 
    Vmle = np.diag(np.linalg.inv(OPG)/len(y))
    
    return theta, Vmle

# load data
dairyfarms = pd.read_excel('spanishdairy.xls', usecols=['DEPREC', 'FARMEXP', 'MILK', 
                                                        'LABOR', 'COWS', 'FEED', 'LAND', 
                                                        'YEAR', 'COOP', 'PFEED', 'PMILK', 'LANDOWN'])

# Drop farms with zero roughage
data = dairyfarms[(dairyfarms['DEPREC'] != 0) & (dairyfarms['FARMEXP'] != 0)].copy()
data.reset_index(inplace=True)
data['ROUGHAGE'] = data['FARMEXP'] + data['DEPREC']

#scaling to have unit means:
# scaling data
y = np.log(data['MILK']) - np.log(np.mean(data['MILK']))
X = data[['LABOR', 'COWS', 'FEED', 'LAND', 'ROUGHAGE']]
X = np.log(X) - np.repeat(np.log(np.mean(X.values, axis=0)).reshape(1,-1), len(y), axis=0)
X = pd.concat([pd.DataFrame(data={'CONST': np.ones((len(X)))}), X], axis=1)

# create indicator matrix D
D = pd.get_dummies(data['YEAR'], prefix='YEAR', drop_first=True, dtype=int)

# create Z matrix
data['COOP'] = data['COOP'].astype(int)
COOP_dummy = pd.get_dummies(data['COOP'], prefix='COOP', drop_first=False, dtype=int)
COOP_dummy = COOP_dummy[['COOP_303', 'COOP_304', 'COOP_306', 'COOP_313', 'COOP_307', 'COOP_308', 'COOP_310']]

Z = pd.concat([pd.DataFrame(data={'CONST': np.ones((len(X)))}), 
               X[['LAND']], data[['PMILK', 'PFEED', 'LANDOWN']], 
               COOP_dummy], axis=1)

Z = pd.concat([Z, D], axis=1)

# compute regression coefficients and standard errors
OLS1 = sm.OLS(X[['LABOR']].values, Z.values)
results1 = OLS1.fit()
p1 = results1.params
se1 = np.sqrt(np.diag(results1.cov_params()))
x1h = Z@p1

OLS2 = sm.OLS(X[['COWS']].values, Z.values)
results2 = OLS2.fit()
p2 = results2.params
se2 = np.sqrt(np.diag(results2.cov_params()))
x2h = Z@p2

OLS3 = sm.OLS(X[['FEED']].values, Z.values)
results3 = OLS3.fit()
p3 = results3.params
se3 = np.sqrt(np.diag(results3.cov_params()))
x3h = Z@p3

OLS4 = sm.OLS(X[['LAND']].values, Z.values)
results4 = OLS4.fit()
p4 = results4.params
se4 = np.sqrt(np.diag(results4.cov_params()))
x4h = Z@p4

OLS5 = sm.OLS(X[['ROUGHAGE']].values, Z.values)
results5 = OLS5.fit()
p5 = results5.params
se5 = np.sqrt(np.diag(results5.cov_params()))
x5h = Z@p5
    
Xh = pd.concat([X['CONST'], x1h, x2h, x3h, x4h, x5h, D], axis = 1)

X = pd.concat([X, D], axis=1)

[GMM_coefs, GMM_vars] = AppEstimate_GMM(y, X, Xh, Z)
GMM_beta0 = GMM_coefs[:-2]
GMM_sig0 = GMM_coefs[-2]
GMM_lam0 = GMM_coefs[-1]
GMM_sigv = (GMM_sig0/np.sqrt(1+GMM_lam0**2))**2
GMM_sigu = GMM_sig0**2-GMM_sigv
GMM_Eu = np.sqrt((2/np.pi)*GMM_sigu)
GMM_ster = np.sqrt(GMM_vars)
GMM_ster_beta0 = GMM_ster[:-2]
GMM_sig0_ster = GMM_ster[-2]
GMM_lam0_ster = GMM_ster[-1]
    

#Display the results as a table 
results = pd.DataFrame(data = {'Est': np.concatenate([GMM_beta0, 
                                                      np.array([GMM_sig0, GMM_lam0, GMM_sigu, GMM_sigv, GMM_Eu])], 
                                                     axis = 0), 
                               'St.Err':np.concatenate([GMM_ster_beta0, np.array([GMM_sig0_ster, GMM_lam0_ster]), np.array([np.nan, np.nan, np.nan])], axis = 0)},
                       index = ["Const", "Labor", "Cows", "Feed", 
                                "Land", "Rough", "D00", "D01", "D02", 
                                "D03", "D04", "D05", "D06", "D07", 
                                "D08", "D09", "D10", "Sigma", 
                                "Lambda", "Sigma2_u", "Sigma2_v", "E(u)"])
results = results.round(4)
display(results)

	Est	St.Err
Const	0.0485	0.0249
Labor	0.2545	0.0528
Cows	0.8541	0.0698
Feed	0.0506	0.0401
Land	-0.1031	0.0163
Rough	0.1336	0.0212
D00	0.0015	0.0225
D01	-0.0152	0.0223
D02	0.0284	0.0228
D03	0.0127	0.0243
D04	0.0219	0.0257
D05	0.1292	0.0346
D06	0.1670	0.0374
D07	0.2048	0.0379
D08	0.1640	0.0377
D09	0.1584	0.0386
D10	0.1732	0.0367
Sigma	0.2200	0.0320
Lambda	1.7594	0.1766
Sigma2_u	0.0366	NaN
Sigma2_v	0.0118	NaN
E(u)	0.1526	NaN

# Remove all previous function and variable definitions before the next exercise
%reset

5.3 Exercise 5.3: LIML-1 of Spanish diary farm production model

This exercise implements the LIML-1 estimator for a sample of 137 milk farms in Northern Spain during 1999-2010.

import numpy as np 
import pandas as pd
import statsmodels.api as sm

from scipy import stats
from scipy.optimize import minimize

def fill_lower_triangular_columnwise(matrix, values):
    val_index = 0
    for j in range(matrix.shape[1]):
        for i in range(j, matrix.shape[0]):
            if val_index < len(values):
                matrix[i, j] = values[val_index]
                val_index += 1
    return matrix 

def lnL1_4_2(coefs, y, X, etas):
    
    nX = X.shape[1]
    beta = coefs[:nX]  
    sig = coefs[-2]
    lam = coefs[-1]
    sigve = coefs[nX:-2]
    sigee = 1/len(y)*(etas.T@etas)
                         
    correction = (np.linalg.lstsq(sigee, sigve.T, rcond=None)[0])@sigve.T
    lamc = lam/np.sqrt(1-(1+lam**2)/sig**2*correction)
    sigc = np.sqrt(sig**2-correction) 
    muci = (np.linalg.lstsq(sigee, sigve.T, rcond=None)[0])@etas.T
    er = y - X@beta - muci
    l3 = np.log(stats.norm.cdf(-lamc/sigc*er))
    
    lnL1 = -0.5*np.log(sigc**2)-0.5*(1/(sigc**2)*(er**2))+l3
    
    return lnL1

def lnL2_4_2(sigee, etas):
    
    L2 = np.zeros(len(etas))
    for i in range(len(etas)):
        np.linalg.lstsq(sigee, etas[i,:], rcond=None)[0]
        L2[i] = -0.5*np.log(np.linalg.det(sigee))-0.5*np.linalg.lstsq(sigee, etas[i,:], rcond=None)[0]@etas[i,:].T
    
    return L2

def AppLogDen_LIML(coefs, y, X, Z):
    
    nX = X.shape[1]
    nZ = Z.shape[1]
    
    beta = coefs[:nX]
    p1 = coefs[nX:nX+nZ]
    p2 = coefs[nX+nZ:nX+2*nZ] 
    p3 = coefs[nX+2*nZ:nX+3*nZ]
    p5 = coefs[nX+3*nZ:nX+4*nZ]
    
    sigve = coefs[(nX+4*nZ):(nX+4*nZ+4)]
    sigee = np.zeros((4, 4))
    sigee = fill_lower_triangular_columnwise(matrix=sigee, 
                                             values=coefs[(nX+4*nZ+4):-2])
    sig = coefs[-2]
    lam = coefs[-1]
 
    sigee = sigee + sigee.T-np.diag(np.diag(sigee),0)

    zz = np.concatenate([(Z@p1).reshape(-1,1), (Z@p2).reshape(-1,1), 
                         (Z@p3).reshape(-1,1), (Z@p5).reshape(-1,1)], 
                        axis=1)
    etas = X[:,[1, 2, 3, 5]] - zz
    
    lnL1_4_2_input = np.concatenate([beta, sigve, np.array([sig]), np.array([lam])])
    
    logL1 = lnL1_4_2(lnL1_4_2_input, y, X, etas)
    logL2 = lnL2_4_2(sigee, etas)
    
    den = logL1+logL2
    
    return den

def AppLoglikelihood_LIML(coefs, y, X, Z):
    
    den = AppLogDen_LIML(coefs, y, X, Z)
    
    logL = -np.sum(den)
    
    return logL

def AppEstimate_LIML(y, X, Xh, Z):
    
    beta0 = [0.0368, 0.1169, 0.9685, 0.1156, -0.1047, 
             0.0911, 0.0096, 0.0244, 0.0354, 
             0.0132, 0.0423, 0.0933, 0.1305, 0.1302, 
             0.1151, 0.1025, 0.1222]
    
    pi0 =[-2.4855, 0.3611, 5.8886, 0.0099, -0.0004, 0.3254, 0.2840, 0.3871, 
          0.2345, 0.4246, 0.2559, -0.0259, 0.0354, -0.1946, 0.0931, 0.1653, 
          0.0441, -0.0607, -0.0133, -0.3113, -0.4158, -0.0276, 0.0979, -3.2824, 
          0.5151, 7.2669, -0.3733, -0.0010, 0.2991, 0.2543, 0.2994, 0.3148, 
          0.3958, 0.1797, 0.1222, 0.0900, -0.0557, 0.3615, 0.4272, 0.3492, 
          0.4799, 0.6147, 0.2772, 0.1449, 0.6324, 0.7493, -4.5369, 0.5496, 
          11.5406, -2.3140, -0.0002, 0.1831, 0.2777, 0.2762, 0.2423, 0.3778, 
          0.0484, -0.0042, 0.1155, -0.0888, 0.5038, 0.5813, 0.4908, 0.6293, 
          0.8405, 0.3581, 0.2583, 0.9450, 1.0859, -6.6371, 0.7891, 15.9134, 
          -0.5840, -0.0031, 0.0151, 0.1536, 0.1794, 0.2626, 0.0726, -0.0861, 
          -0.4394, 0.1696, -0.1905, 0.6295, 0.6861, 0.6209, 0.7861, 1.0791, 
          0.3271, -0.0266, 1.0086, 1.2396]

    sigee0 = [0.1171, 0.0641, 0.0736, 0.0832, 0.0815, 0.0921, 0.1065, 0.1664, 
             0.1441, 0.3662]
    sigve0 = [-0.0198, -0.0163, -0.0032, -0.0134]
   
    sig0 = 0.19
    lam0 = 1.27

    theta0 = np.array(beta0 + pi0 + sigve0 + sigee0 + [sig0, lam0])
    
    # Minimize the negative log-likelihood using numerical optimization.
        # nelder-mead method obtains a solution closer to the MATLAB solution
    MLE_results = minimize(
        fun=AppLoglikelihood_LIML,
        x0=theta0,
        method="Nelder-Mead",
        tol = 1e-6,
        options={"maxiter": 10000},
        args=(y.values, X.values, Z.values),
    )
    theta = MLE_results.x
    
    #standard errors
    delta = 1e-10
    grad = np.zeros((len(y), len(theta)))
    for i in range(len(theta)):
        theta1 = np.copy(theta)
        theta1[i] = theta[i] + delta
        grad[:,i] = (AppLogDen_LIML(theta1, y.values, X.values, Z.values) - AppLogDen_LIML(theta, y.values, X.values, Z.values))/delta

    OPG = grad.T@grad 
    
    Vmle = np.diag(np.linalg.inv(OPG))
    
    return theta, Vmle

# load data
dairyfarms = pd.read_excel('spanishdairy.xls', usecols=['DEPREC', 'FARMEXP', 'MILK', 
                                                        'LABOR', 'COWS', 'FEED', 'LAND', 
                                                        'YEAR', 'COOP', 'PFEED', 'PMILK', 'LANDOWN'])

# Drop farms with zero roughage
data = dairyfarms[(dairyfarms['DEPREC'] != 0) & (dairyfarms['FARMEXP'] != 0)].copy()
data.reset_index(inplace=True)
data['ROUGHAGE'] = data['FARMEXP'] + data['DEPREC']

#scaling to have unit means:
# scaling data
y = np.log(data['MILK']) - np.log(np.mean(data['MILK']))
X = data[['LABOR', 'COWS', 'FEED', 'LAND', 'ROUGHAGE']]
X = np.log(X) - np.repeat(np.log(np.mean(X.values, axis=0)).reshape(1,-1), len(y), axis=0)
X = pd.concat([pd.DataFrame(data={'CONST': np.ones((len(X)))}), X], axis=1)

# create indicator matrix D
D = pd.get_dummies(data['YEAR'], prefix='YEAR', drop_first=True, dtype=int)

# create Z matrix
data['COOP'] = data['COOP'].astype(int)
COOP_dummy = pd.get_dummies(data['COOP'], prefix='COOP', drop_first=False, dtype=int)
COOP_dummy = COOP_dummy[['COOP_303', 'COOP_304', 'COOP_306', 'COOP_313', 'COOP_307', 'COOP_308', 'COOP_310']]

Z = pd.concat([pd.DataFrame(data={'CONST': np.ones((len(X)))}), 
               X[['LAND']], data[['PMILK', 'PFEED', 'LANDOWN']], 
               COOP_dummy], axis=1)

Z = pd.concat([Z, D], axis=1)

# compute regression coefficients and standard errors
OLS1 = sm.OLS(X[['LABOR']].values, Z.values)
results1 = OLS1.fit()
p1 = results1.params
se1 = np.sqrt(np.diag(results1.cov_params()))
x1h = Z@p1

OLS2 = sm.OLS(X[['COWS']].values, Z.values)
results2 = OLS2.fit()
p2 = results2.params
se2 = np.sqrt(np.diag(results2.cov_params()))
x2h = Z@p2

OLS3 = sm.OLS(X[['FEED']].values, Z.values)
results3 = OLS3.fit()
p3 = results3.params
se3 = np.sqrt(np.diag(results3.cov_params()))
x3h = Z@p3

OLS4 = sm.OLS(X[['LAND']].values, Z.values)
results4 = OLS4.fit()
p4 = results4.params
se4 = np.sqrt(np.diag(results4.cov_params()))
x4h = Z@p4

OLS5 = sm.OLS(X[['ROUGHAGE']].values, Z.values)
results5 = OLS5.fit()
p5 = results5.params
se5 = np.sqrt(np.diag(results5.cov_params()))
x5h = Z@p5
    
Xh = pd.concat([X['CONST'], x1h, x2h, x3h, x4h, x5h, D], axis = 1)

X = pd.concat([X, D], axis=1)

(LIML_coefs, LIML_vars) = AppEstimate_LIML(y, X, Xh, Z)

LIML_beta0 = LIML_coefs[:17]
LIML_sig0 = LIML_coefs[-2]
LIML_lam0 = LIML_coefs[-1]
LIML_sigv = (LIML_sig0/np.sqrt(1+LIML_lam0**2))**2
LIML_sigu = LIML_sig0**2-LIML_sigv
LIML_Eu = np.sqrt((2/np.pi)*LIML_sigu)
LIML_ster = np.sqrt(LIML_vars)
LIML_ster_beta0 = LIML_ster[:17]
LIML_sig0_ster = LIML_ster[-2]
LIML_lam0_ster = LIML_ster[-1]

#Display the results as a table 
results = pd.DataFrame(data = {'Est': np.concatenate([LIML_beta0, 
                                                      np.array([LIML_sig0, LIML_lam0, LIML_sigu, LIML_sigv, LIML_Eu])], 
                                                     axis = 0), 
                               'St.Err':np.concatenate([LIML_ster_beta0, np.array([LIML_sig0_ster, LIML_lam0_ster]), np.array([np.nan, np.nan, np.nan])], axis = 0)},
                       index = ["Const", "Labor", "Cows", "Feed", 
                                "Land", "Rough", "D00", "D01", "D02", 
                                "D03", "D04", "D05", "D06", "D07", 
                                "D08", "D09", "D10", "Sigma", 
                                "Lambda", "Sigma2_u", "Sigma2_v", "E(u)"])
results = results.round(4)
display(results)

	Est	St.Err
Const	0.0334	0.0268
Labor	0.1269	0.0631
Cows	0.9747	0.0811
Feed	0.1066	0.0571
Land	-0.1136	0.0183
Rough	0.0944	0.0279
D00	0.0118	0.0220
D01	0.0204	0.0239
D02	0.0346	0.0264
D03	0.0130	0.0268
D04	0.0384	0.0261
D05	0.0926	0.0379
D06	0.1326	0.0402
D07	0.1329	0.0411
D08	0.1147	0.0402
D09	0.1010	0.0430
D10	0.1270	0.0425
Sigma	0.1895	0.0077
Lambda	1.1973	0.1476
Sigma2_u	0.0212	NaN
Sigma2_v	0.0148	NaN
E(u)	0.1161	NaN

# Remove all previous function and variable definitions before the next exercise
%reset

5.4 Exercise 5.4: LIML-2 of Spanish diary farm production model

This exercise implements the LIML-2 estimator for a sample of 137 milk farms in Northern Spain during 1999-2010.

import numpy as np 
import pandas as pd
import statsmodels.api as sm

from scipy import stats
from scipy.optimize import minimize

def AppLogDen_LIML2(coefs, y, X, etas):
    
    nX = X.shape[1]
    
    beta = coefs[:nX]
    sig = coefs[-2]
    lam = coefs[-1]
    
    sigve = coefs[nX:-2]
    sigee = 1/len(y)*(etas.T@etas)
    correction = (np.linalg.lstsq(sigee, sigve, rcond=None)[0])@sigve.T
    
    lamc = lam/np.sqrt(1-(1+lam**2)/sig**2*correction)
    sigc = np.sqrt(sig**2-correction)
    muci = (np.linalg.lstsq(sigee, sigve, rcond=None)[0])@etas.T

    er = y - X@beta - muci.T
    l3 = np.log(stats.norm.cdf(-lamc/sigc*er));
    lnL1 = -0.5*np.log(sigc**2)-0.5*(1/(sigc**2)*(er**2))+l3
    
    return lnL1

def AppLoglikelihood_LIML2(coefs, y, X, etas):
    
    den = AppLogDen_LIML2(coefs, y, X, etas)
    
    logL = -np.sum(den)
    
    return logL

def AppEstimate_LIML2(y, X, Xh, etas):
    
    beta0 = [0.0368, 0.1169, 0.9685, 0.1156, -0.1047, 
             0.0911, 0.0096, 0.0244, 0.0354, 
             0.0132, 0.0423, 0.0933, 0.1305, 0.1302, 
             0.1151, 0.1025, 0.1222]

    sigve0 = [0.0, 0.0, 0.0, 0.0]
   
    sig0 = 0.17
    lam0 = 1.7

    theta0 = np.array(beta0 + sigve0 + [sig0, lam0])
    
    # Minimize the negative log-likelihood using numerical optimization.
        # nelder-mead method obtains a solution closer to the MATLAB solution
    MLE_results = minimize(
        fun=AppLoglikelihood_LIML2,
        x0=theta0,
        method="Nelder-Mead",
        tol = 1e-6,
        options={"maxiter": 5000},
        args=(y.values, X.values, etas.values),
    )
    theta = MLE_results.x
    
    #standard errors
    delta = 1e-10
    grad = np.zeros((len(y), len(theta)))
    for i in range(len(theta)):
        theta1 = np.copy(theta)
        theta1[i] = theta[i] + delta
        grad[:,i] = (AppLogDen_LIML2(theta1, y.values, X.values, etas.values) - AppLogDen_LIML2(theta, y.values, X.values, etas.values))/delta

    OPG = grad.T@grad 
    
    Vmle = np.diag(np.linalg.inv(OPG))
    
    return theta, Vmle

# load data
dairyfarms = pd.read_excel('spanishdairy.xls', usecols=['DEPREC', 'FARMEXP', 'MILK', 
                                                        'LABOR', 'COWS', 'FEED', 'LAND', 
                                                        'YEAR', 'COOP', 'PFEED', 'PMILK', 'LANDOWN'])

# Drop farms with zero roughage
data = dairyfarms[(dairyfarms['DEPREC'] != 0) & (dairyfarms['FARMEXP'] != 0)].copy()
data.reset_index(inplace=True)
data['ROUGHAGE'] = data['FARMEXP'] + data['DEPREC']

#scaling to have unit means:
# scaling data
y = np.log(data['MILK']) - np.log(np.mean(data['MILK']))
X = data[['LABOR', 'COWS', 'FEED', 'LAND', 'ROUGHAGE']]
X = np.log(X) - np.repeat(np.log(np.mean(X.values, axis=0)).reshape(1,-1), len(y), axis=0)
X = pd.concat([pd.DataFrame(data={'CONST': np.ones((len(X)))}), X], axis=1)

# create indicator matrix D
D = pd.get_dummies(data['YEAR'], prefix='YEAR', drop_first=True, dtype=int)

# create Z matrix
data['COOP'] = data['COOP'].astype(int)
COOP_dummy = pd.get_dummies(data['COOP'], prefix='COOP', drop_first=False, dtype=int)
COOP_dummy = COOP_dummy[['COOP_303', 'COOP_304', 'COOP_306', 'COOP_313', 'COOP_307', 'COOP_308', 'COOP_310']]

Z = pd.concat([pd.DataFrame(data={'CONST': np.ones((len(X)))}), 
               X[['LAND']], data[['PMILK', 'PFEED', 'LANDOWN']], 
               COOP_dummy], axis=1)

Z = pd.concat([Z, D], axis=1)

# compute regression coefficients and standard errors
OLS1 = sm.OLS(X[['LABOR']].values, Z.values)
results1 = OLS1.fit()
p1 = results1.params
se1 = np.sqrt(np.diag(results1.cov_params()))
x1h = Z@p1
etah1 = X['LABOR'] - x1h

OLS2 = sm.OLS(X[['COWS']].values, Z.values)
results2 = OLS2.fit()
p2 = results2.params
se2 = np.sqrt(np.diag(results2.cov_params()))
x2h = Z@p2
etah2 = X['COWS'] - x2h

OLS3 = sm.OLS(X[['FEED']].values, Z.values)
results3 = OLS3.fit()
p3 = results3.params
se3 = np.sqrt(np.diag(results3.cov_params()))
x3h = Z@p3
etah3 = X['FEED'] - x3h

OLS4 = sm.OLS(X[['LAND']].values, Z.values)
results4 = OLS4.fit()
p4 = results4.params
se4 = np.sqrt(np.diag(results4.cov_params()))
x4h = Z@p4
etah4 = X['LAND'] - x4h

OLS5 = sm.OLS(X[['ROUGHAGE']].values, Z.values)
results5 = OLS5.fit()
p5 = results5.params
se5 = np.sqrt(np.diag(results5.cov_params()))
x5h = Z@p5
etah5 = X['ROUGHAGE'] - x5h
    
Xh = pd.concat([X['CONST'], x1h, x2h, x3h, x4h, x5h, D], axis = 1)

X = pd.concat([X, D], axis=1)

etas = pd.concat([etah1, etah2, etah3, etah5], axis=1)

(LIML2_coefs, LIML2_vars) = AppEstimate_LIML2(y, X, Xh, etas)

LIML2_beta0 = LIML2_coefs[:17]
LIML2_sig0 = LIML2_coefs[-2]
LIML2_lam0 = LIML2_coefs[-1]
LIML2_sigv = (LIML2_sig0/np.sqrt(1+LIML2_lam0**2))**2
LIML2_sigu = LIML2_sig0**2-LIML2_sigv
LIML2_Eu = np.sqrt((2/np.pi)*LIML2_sigu)
LIML2_ster = np.sqrt(LIML2_vars)
LIML2_ster_beta0 = LIML2_ster[:17]
LIML2_sig0_ster = LIML2_ster[-2]
LIML2_lam0_ster = LIML2_ster[-1]

#Display the results as a table 
results = pd.DataFrame(data = {'Est': np.concatenate([LIML2_beta0, 
                                                      np.array([LIML2_sig0, LIML2_lam0, LIML2_sigu, LIML2_sigv, LIML2_Eu])], 
                                                     axis = 0), 
                               'St.Err':np.concatenate([LIML2_ster_beta0, np.array([LIML2_sig0_ster, LIML2_lam0_ster]), np.array([np.nan, np.nan, np.nan])], axis = 0)},
                       index = ["Const", "Labor", "Cows", "Feed", 
                                "Land", "Rough", "D00", "D01", "D02", 
                                "D03", "D04", "D05", "D06", "D07", 
                                "D08", "D09", "D10", "Sigma", 
                                "Lambda", "Sigma2_u", "Sigma2_v", "E(u)"])
results = results.round(4)
display(results)

	Est	St.Err
Const	0.0212	0.0192
Labor	0.1392	0.0451
Cows	0.8295	0.0615
Feed	0.1731	0.0242
Land	-0.0652	0.0128
Rough	0.0883	0.0182
D00	-0.0037	0.0156
D01	0.0365	0.0174
D02	0.0445	0.0190
D03	0.0733	0.0206
D04	0.0636	0.0196
D05	0.1210	0.0277
D06	0.1564	0.0292
D07	0.1546	0.0294
D08	0.1377	0.0284
D09	0.1225	0.0306
D10	0.1470	0.0298
Sigma	0.1848	0.0057
Lambda	1.4775	0.1424
Sigma2_u	0.0234	NaN
Sigma2_v	0.0107	NaN
E(u)	0.1221	NaN

# Remove all previous function and variable definitions before the next exercise
%reset