*******************************************************
*This do file contains validation code for linked CPS
*files in wide format. For details see below
*
*Written by the IPUMS-CPS team
*31 May 2018
*******************************************************

/***************************************************************************************************************
This validation do file will validate links based on AGE, SEX, and RACE for any linked wide file where
the following assumptions are met:
	1. All nonlinks based on CPSIDP are dropped before validation
	2. Time-varying variables are named with a stub, an underscore, and a generic time index, as produced by
	   Stata's reshape wide with stubs ending in an underscore (e.g. sex_1, sex_2). The index must be a
	   generic time counter (1, 2, ...), not a value of MONTH or MISH
	3. Both MONTH and YEAR have a variable for each observed time point (e.g. year_1 and year_2)
	4. MISH is present for the first and the last time point (mish_1 and mish_[last]); the code below
	   uses these two variables to decide how much AGE is allowed to increase
	5. Only matches across all observations of AGE, SEX, and RACE are considered valid

The user must specify the number of expected observations as an argument to the do file
(do validate_wide.txt 2). If a user wishes to add other validation variables to the basic AGE, SEX, and RACE,
these may be added as arguments as well (e.g. do validate_wide.txt 2 marst - this will validate a linked file
with two time observations on AGE, SEX, RACE, and MARST).

If you want to use this validation code, no changes are required. Simply refer to this do file from
within another do file. For example, if you want to validate links across two time points:

	do validate_wide.txt 2

Variables created: [var]_match for each validated variable, allowable_age_diff, age_match, and all_match.
******************************************************************************************************************/

* Parse the arguments: the first is the number of time points, anything after
* it is a list of extra variables to validate by strict equality.
local args `0'
local expected_times `1'
local not `expected_times'
local validation_vars: list args - not

* SEX and RACE are always validated. Drop duplicates so that passing sex or
* race again does not make -gen- fail, and drop age because AGE is validated
* separately in Step 1b with its own tolerance rules.
local simple_validation_vars sex race `validation_vars'
local simple_validation_vars: list uniq simple_validation_vars
local age_var age
local simple_validation_vars: list simple_validation_vars - age_var

di "`expected_times'"
di "`simple_validation_vars'"

/********************************************************************************************************
Users should not need to alter anything below here unless they wish to tweak the validation requirements
********************************************************************************************************/

/*Step 0 -- check to make sure that number of times individuals will be observed in data (as defined
by the user) is between 2 and 8*/

* Stop with an error when the argument is missing; continuing would only
* produce a syntax error in the range check below.
if missing("`expected_times'") {
	di as error "!!!!!Please specify the number of time points you wish to validate."
	exit 198
}

* The argument must be a whole number before it can be compared or used as a loop bound.
capture confirm integer number `expected_times'
if _rc {
	di as error "!!!!!Number of expected observations must be a whole number between 2 and 8! Value is currently `expected_times'. Please pass it as the first argument to this do file!!!!!"
	exit 198
}

if `expected_times' < 2 | `expected_times' > 8 {
	di as error "!!!!!Number of expected observations must be between 2 and 8! Value is currently `expected_times'. Please pass it as the first argument to this do file!!!!!"
	exit 198
}

/*Step 1*/

/*Step 1a -- start with the easy vars -- SEX and RACE (plus any user-supplied variables).
A record matches only if the value at every later time point equals the value at time 1.*/
foreach var of local simple_validation_vars {
	gen `var'_match = 1
	* No early exit from this loop: -replace- only ever sets 0, and a
	* programming -if- on a variable would look at the first observation
	* only, which would skip later time points for every other record.
	forvalues i = 2/`expected_times' {
		replace `var'_match = 0 if `var'_`i' != `var'_1
	}
}

/*Step 1b -- now onto AGE which is more complicated*/
gen allowable_age_diff = .

/*Rule 1: allow a one-year age increase when the 8-month break is not inside the window of observation:
either the first observation has mish 1-3 and the last observation has mish <= 4,
or the first observation has mish 5-7*/
replace allowable_age_diff = 1 if ((mish_1 >= 1 & mish_1 < 4) & (mish_`expected_times' <= 4)) | (mish_1 >= 5 & mish_1 < 8)

/*Rule 2: allow a two-year age increase when the 8-month break is inside the window of observation:
the first observation has mish 4, or it has mish < 4 and the last observation has mish > 4*/
replace allowable_age_diff = 2 if allowable_age_diff == . & (mish_1 == 4 | (mish_1 < 4 & mish_`expected_times' > 4))

/*Rule 3: account for age topcodes 80 85 -- when the first age is 80, allow an increase of up to 5
so that a move from 80 to 85 is not rejected*/
replace allowable_age_diff = 5 if (allowable_age_diff == 1 | allowable_age_diff == 2) & age_1 == 80

/*Create the "age match" variable: age may never decrease and may increase by at most allowable_age_diff
relative to time 1. As in Step 1a, every time point is checked for every record.*/
gen age_match = 1
forvalues i = 2/`expected_times' {
	replace age_match = 0 if age_`i' != age_1 & (age_`i' < age_1 | age_`i' > age_1 + allowable_age_diff)
}

/*Create an "all match" variable: 1 only if AGE and every validated variable match*/
gen all_match = (age_match == 1)
foreach var of local simple_validation_vars {
	replace all_match = 0 if `var'_match != 1
}

count if all_match==1

disp "your next step might be to **keep if all_match==1**"