clear all set seed 20240202 // Random number seed -- use for replicability ******************************************************************************** * Define scenario ******************************************************************************** // HTE, no PTA violations // * Scenario 1a: homogeneous, constant effects, no time-varying covariates or PTA violations * Scenario 1b: heterogeneous (at random), constant effects, no time-varying covariates or PTA violations * Scenario 1c: heterogeneous (large first), constant effects, no time-varying covariates or PTA violations * Scenario 2a: homogeneous, linear effects, no time-varying covariates or PTA violations * Scenario 2b: heterogeneous (at random), linear effects, no time-varying covariates or PTA violations * Scenario 2c: heterogeneous (large first), linear effects, no time-varying covariates or PTA violations // HTE + PTA violations // * Scenario 3a: homogeneous, constant effects, PTA violation * Scenario 3b: heterogeneous (at random), PTA violation * Scenario 3c: heterogeneous (large first), PTA violation * Scenario 3d: homogeneous, linear effects, PTA violation * Scenario 3e: heterogeneous (at random), linear effects, PTA violation * Scenario 3f: heterogeneous (large first), linear effects, PTA violation global scenario "1a" if "$scenario" == "1a" { global het_id ""homogeneous"" // Options: homogenous, random [heterogeneous at random], or large_first global het_time ""constant"" // Options: constant, linear global covar ""no"" // Any covariates; options: yes, no global pta_viol ""no"" // Options: yes, no } if "$scenario" == "1b" { global het_id ""random"" global het_time ""constant"" global covar ""no"" global pta_viol ""no"" } if "$scenario" == "1c" { global het_id ""large_first"" global het_time ""constant"" global covar ""no"" global pta_viol ""no"" } if "$scenario" == "2a" { global het_id ""homogeneous"" global het_time ""linear"" global covar ""no"" global pta_viol ""no"" } if "$scenario" == "2b" { global het_id ""random"" global het_time ""linear"" global covar ""no"" global pta_viol ""no"" } if "$scenario" == "2c" { global het_id ""large_first"" global het_time ""linear"" global covar ""no"" global pta_viol ""no"" } if "$scenario" == "3a" { global het_id ""homogeneous"" global het_time ""constant"" global covar ""yes"" global pta_viol ""yes"" } if "$scenario" == "3b" { global het_id ""random"" global het_time ""constant"" global covar ""yes"" global pta_viol ""yes"" } if "$scenario" == "3c" { global het_id ""large_first"" global het_time ""constant"" global covar ""yes"" global pta_viol ""yes"" } if "$scenario" == "3d" { global het_id ""homogeneous"" global het_time ""linear"" global covar ""yes"" global pta_viol ""yes"" } if "$scenario" == "3e" { global het_id ""random"" global het_time ""linear"" global covar ""yes"" global pta_viol ""yes"" } if "$scenario" == "3f" { global het_id ""large_first"" global het_time ""linear"" global covar ""yes"" global pta_viol ""yes"" } ******************************************************************************** * Program to run DiD estimators ******************************************************************************** cap program drop simulate_DiD program define simulate_DiD, rclass drop _all local units = 30 // number of units local time = 36 // number of periods local c = 6 // number of cohorts local obsv = `units' * `time' set obs `obsv' egen id = seq(), b(`time') local start = 1 local end = `time' egen t = seq(), f(`start') t(`end') sort id t xtset id t gen D = 0 // treatment turned on gen cohort = . // treatment cohort gen effect = . // treatment effect size gen timing = . // when the treatment happens for each cohort (uncensored) gen first_treat = . // when the treatment happens for each cohort (censored) gen rel_time = . // time - first_treat * Set variables for scenario local het_id = $het_id local het_time = $het_time local covar = $covar // local confound = $confound local pta_viol = $pta_viol * Assign id to cohorts levelsof id, local(lvls) foreach x of local lvls { local chrt = runiformint(1,`c') replace cohort = `chrt' if id == `x' } * Assign time of first treatment levelsof cohort, local(lvls) foreach x of local lvls { local timing = runiformint(`start',`end' + 10) replace timing = `timing' if cohort == `x' replace first_treat = `timing' if cohort == `x' replace first_treat = . if first_treat > `end' replace D = 1 if cohort==`x' & t >= `timing' // D is real time treatment status } replace rel_time = t - first_treat // Assign treatment effect * Scenario a: Homogenous effects (across cohorts) if "`het_id'" == "homogeneous" { // note: this syntax only works inside of a program local min_eff = 2 // minimum effect local max_eff = 10 // maximum effect local eff = runiformint(`min_eff', `max_eff') replace effect = `eff' } * Scenario b: Heterogeneous effects (random by cohort) if "`het_id'" == "random" { levelsof cohort, local(lvls) foreach x of local lvls { local min_eff = 2 // minimum effect local max_eff = 10 // maximum effect local eff = runiformint(`min_eff', `max_eff') replace effect = `eff' if cohort==`x' } } * Scenario c: Heterogeneous effects (large first) gsort -first_treat cohort id t if "`het_id'" == "large_first" { levelsof first_treat, local(lvls) miss local min_eff = 6 // minimum effect local max_eff = 10 // maximum effect local eff = runiformint(`min_eff', `max_eff') local i 0 foreach x of local lvls { local ++i replace effect = `eff' if first_treat ==`x' & `i'== 1 replace effect = `eff' - 1 if first_treat ==`x' & `i'== 2 replace effect = `eff' - 2 if first_treat ==`x' & `i'== 3 replace effect = `eff' - 3 if first_treat ==`x' & `i'== 4 replace effect = `eff' - 4 if first_treat ==`x' & `i'== 5 replace effect = `eff' - 5 if first_treat ==`x' & `i'== 6 } } * Scenario 1: Constant effect if "`het_time'" == "constant" { replace effect = effect // no change needed } * Scenario 2: Linear trend of effect if "`het_time'" == "linear" { gen trend = (t - first_treat + 1)/`end' if D == 1 // magnitude of linear trend replace effect = effect * (t - first_treat + 1)/`end' if D == 1 } * replace effect to 0 for never-treated replace effect = 0 if first_treat == . * Create unit and time FE bys t: gen time_fe = rnormal() if _n == 1 bys t: replace time_fe = time_fe[_n-1] if _n > 1 // randomly assign a time FE for each time period bys id: gen unit_fe = rnormal() if _n == 1 bys id: replace unit_fe = unit_fe[_n-1] if _n > 1 // randomly assign a unit FE for each unit * Create other variables bys id: gen x = runiform(0, 1) if _n == 1 // change rnorml to runiform to avoid negative x bys id: replace x = x[_n-1] if _n > 1 // assume covariate is time-invariant gen e = rnormal() * Covariates if "`covar'" == "no" { local gamma = 0 } * Scenario 3: PTA violation: use time-invariant covariates with time-varying effects for DGP if "`covar'" == "yes" & "`pta_viol'" == "yes" { local gamma = 1 // gen pta_trend = `gamma' * t * x * `i'/(`max_eff'*2) if first_treat < . // magnitude of PTA violation } * Generate y0, y1, y gen y0 = . gsort -first_treat cohort id t levelsof first_treat, local(lvls) miss local i 0 foreach x of local lvls { local ++i replace y0 = `gamma' * t * x * `i'/(`max_eff'*2) + unit_fe + time_fe + e if first_treat == `x' & `i' == 1 replace y0 = `gamma' * t * x * `i'/(`max_eff'*2) + unit_fe + time_fe + e if first_treat == `x' & `i' == 2 replace y0 = `gamma' * t * x * `i'/(`max_eff'*2) + unit_fe + time_fe + e if first_treat == `x' & `i' == 3 replace y0 = `gamma' * t * x * `i'/(`max_eff'*2) + unit_fe + time_fe + e if first_treat == `x' & `i' == 4 replace y0 = `gamma' * t * x * `i'/(`max_eff'*2) + unit_fe + time_fe + e if first_treat == `x' & `i' == 5 replace y0 = `gamma' * t * x * `i'/(`max_eff'*2) + unit_fe + time_fe + e if first_treat == `x' & `i' == 6 // let slopes vary by initial treatment timing: early treated cohorts have flatter slope } *gen y0 = `gamma' * x + unit_fe + time_fe + e gen y1 = y0 + effect gen y = D * y1 + (1 - D) * y0 sort id t * True effect (ATT) ******************** /* Aggregating ATTs: take a weighted average of the causal effects, where the weights correspond to the number of units treated at each time point*/ sort id rel_time egen att_sum_all = total(effect) if rel_time >= 0 & rel_time != . egen t_sum_all = count(id) if rel_time >= 0 & rel_time != . gen beta_c = att_sum_all / t_sum_all sum beta_c * store estiamtes return scalar beta_true = r(mean) return scalar sd_true = r(sd) * Two-way fixed effects ************************ reghdfe y D x, a(id t) vce(cluster id) return scalar beta_twfe = _b[D] return scalar se_twfe = _se[D] * CS DiD ********* gen timetreatvar = cond(missing(first_treat), 0, first_treat) // 0 for never-D, positive value = which year a group was initially treated csdid y x, ivar(id) time(t) gvar(timetreatvar) notyet // default cluster by indiv estat simple // this is the way how the above true effect is calculated; return scalar beta_cs = r(b)[1,1] return scalar se_cs = r(V)[1,1] * BJS DiD ********** did_imputation y id t first_treat, autosample controls(x) // default cluster by indiv return scalar beta_bjs = _b[tau] return scalar se_bjs = _se[tau] * SA DiD ********** staggered y, i(id) t(t) g(timing) estimand(simple) sa return scalar beta_sa = _b[timing] return scalar se_sa = _se[timing] * WOOLDRIDGE (2021) ********** jwdid y x, ivar(id) time(t) gvar(timetreatvar) estat simple return scalar beta_jw = r(b)[1,1] return scalar se_jw = r(V)[1,1] end ******************************************************************************** * Simulations ******************************************************************************** simulate_DiD * View stored results in rclass return list * Simulation code simulate beta_true = r(beta_true) /// beta_twfe = r(beta_twfe) /// beta_cs = r(beta_cs) /// beta_bjs = r(beta_bjs) /// beta_sa = r(beta_sa) /// beta_jw = r(beta_jw), /// reps(500) seed(20240202): simulate_DiD * Save estimates as a separate data set save "data/sim_estimates_$scenario.dta", replace