require 'csv'
require 'shellwords'

# Imports USDA food data into UsdaFood / UsdaFoodWeight records.
#
# Every file listed in FILES is first sorted by its NDB number column with the
# system `sort` utility, then all sorted files are read in lock-step (see
# #build_enumerator) so the rows that share an NDB number can be combined into
# a single UsdaFood record.
class UsdaImporter
  # Nutrient code => target attribute (:col) and the unit (:unit) the value
  # must be reported in for it to be imported. An optional :convert lambda is
  # applied to the raw value before it is stored.
  # NOTE(review): the conversions presumably turn IU into mcg so branded values
  # line up with the Vit_A_RAE / Vit_D_mcg columns used for the ABBREV file --
  # confirm the factors against the data documentation.
  BRANDED_NUTRIENTS = {
    208 => { col: 'kcal', unit: 'kcal' },
    203 => { col: 'protein', unit: 'g' },
    255 => { col: 'water', unit: 'g' },
    204 => { col: 'lipid', unit: 'g' },
    207 => { col: 'ash', unit: 'g' },
    205 => { col: 'carbohydrates', unit: 'g' },
    291 => { col: 'fiber', unit: 'g' },
    269 => { col: 'sugar', unit: 'g' },
    301 => { col: 'calcium', unit: 'mg' },
    303 => { col: 'iron', unit: 'mg' },
    304 => { col: 'magnesium', unit: 'mg' },
    305 => { col: 'phosphorus', unit: 'mg' },
    306 => { col: 'potassium', unit: 'mg' },
    307 => { col: 'sodium', unit: 'mg' },
    309 => { col: 'zinc', unit: 'mg' },
    312 => { col: 'copper', unit: 'mg' },
    315 => { col: 'manganese', unit: 'mg' },
    401 => { col: 'vit_c', unit: 'mg' },
    415 => { col: 'vit_b6', unit: 'mg' },
    418 => { col: 'vit_b12', unit: 'mcg' },
    318 => { col: 'vit_a', unit: 'IU', convert: ->(x) { x.to_f * 0.3 } },
    324 => { col: 'vit_d', unit: 'IU', convert: ->(x) { x.to_f / 40.0 } },
    430 => { col: 'vit_k', unit: 'mcg' },
    601 => { col: 'cholesterol', unit: 'mg' }
  }.freeze

  # Description of every input file, keyed by an internal name.
  #
  #   :filename     path relative to the import directory
  #   :key_column   column holding the NDB number used to join the files
  #   :csv          true for comma-separated files with a header row; otherwise
  #                 the file is read as '^'-separated with '~' quoting
  #   :columns      column names in file order (used as headers for non-CSV
  #                 files and to locate the sort key)
  #   :map          attribute => column copied onto the target object
  #   :map_function lambda(obj, row) used instead of :map
  #   :map_into     association on the food; one child is built per row
  #   :static       attribute => constant value set on the target object
  FILES = {
    abbreviated: {
      filename: 'ABBREV.txt',
      key_column: 'NDB_No',
      columns: %w[
        NDB_No Shrt_Desc Water Energ_Kcal Protein Lipid_Tot Ash Carbohydrt
        Fiber_TD Sugar_Tot Calcium Iron Magnesium Phosphorus Potassium Sodium
        Zinc Copper Manganese Selenium Vit_C Thiamin Riboflavin Niacin
        Panto_acid Vit_B6 Folate_Tot Folic_acid Food_Folate Folate_DFE
        Choline_Tot Vit_B12 Vit_A_IU Vit_A_RAE Retinol Alpha_Carot Beta_Carot
        Beta_Crypt Lycopene Lut+Zea Vit_E Vit_D_mcg Vit_D_IU Vit_K FA_Sat
        FA_Mono FA_Poly Cholestrl GmWt_1 GmWt_Desc1 GmWt_2 GmWt_Desc2
        Refuse_Pct
      ],
      map: {
        ndbn: 'NDB_No',
        short_description: 'Shrt_Desc',
        water: 'Water',
        kcal: 'Energ_Kcal',
        protein: 'Protein',
        lipid: 'Lipid_Tot',
        ash: 'Ash',
        carbohydrates: 'Carbohydrt',
        fiber: 'Fiber_TD',
        sugar: 'Sugar_Tot',
        gram_weight_1: 'GmWt_1',
        gram_weight_2: 'GmWt_2',
        gram_weight_desc_1: 'GmWt_Desc1',
        gram_weight_desc_2: 'GmWt_Desc2',
        refuse_percent: 'Refuse_Pct',
        calcium: 'Calcium',
        iron: 'Iron',
        magnesium: 'Magnesium',
        phosphorus: 'Phosphorus',
        potassium: 'Potassium',
        sodium: 'Sodium',
        zinc: 'Zinc',
        copper: 'Copper',
        manganese: 'Manganese',
        vit_c: 'Vit_C',
        vit_b6: 'Vit_B6',
        vit_b12: 'Vit_B12',
        vit_a: 'Vit_A_RAE',
        vit_e: 'Vit_E',
        vit_d: 'Vit_D_mcg',
        vit_k: 'Vit_K',
        cholesterol: 'Cholestrl'
      },
      static: { source: 'sr', nutrient_unit: '100 grams' }
    },
    food_data: {
      filename: 'FOOD_DES.txt',
      key_column: 'NDB_No',
      columns: %w[
        NDB_No FdGrp_Cd Long_Desc Shrt_Desc ComName ManufacName Survey
        Ref_desc Refuse SciName N_Factor Pro_Factor Fat_Factor CHO_Factor
      ],
      map: {
        scientific_name: 'SciName',
        refuse_description: 'Ref_desc',
        long_description: 'Long_Desc'
      }
    },
    weights: {
      filename: 'WEIGHT.txt',
      key_column: 'NDB_No',
      map_into: 'usda_food_weights',
      columns: %w[NDB_No Seq Amount Msre_Desc Gm_Wgt Num_Data_Pts Std_Dev],
      map: {
        amount: 'Amount',
        description: 'Msre_Desc',
        gram_weight: 'Gm_Wgt'
      }
    },
    # Branded Food DB files
    products: {
      filename: 'branded/Products.csv',
      key_column: 'NDB_Number',
      csv: true,
      columns: %w[
        NDB_Number long_name data_source gtin_upc manufacturer date_modified
        date_available ingredients_english
      ],
      map: {
        ndbn: 'NDB_Number',
        long_description: 'long_name',
        manufacturer: 'manufacturer',
        ingredients: 'ingredients_english'
      },
      static: { source: 'bf', nutrient_unit: '100 g' }
    },
    nutrients: {
      filename: 'branded/Nutrients.csv',
      key_column: 'NDB_No',
      csv: true,
      columns: %w[
        NDB_No Nutrient_Code Nutrient_name Derivation_Code Output_value
        Output_uom
      ],
      # Stores one nutrient row on obj. Rows whose nutrient code is unknown or
      # whose unit differs from the expected one are ignored.
      map_function: lambda do |obj, row|
        nutrient = BRANDED_NUTRIENTS[row['Nutrient_Code'].to_i]
        next unless nutrient && nutrient[:unit] == row['Output_uom']

        value = row['Output_value']
        # Apply the unit conversion declared for this nutrient, if any.
        value = nutrient[:convert].call(value) if nutrient[:convert]
        obj.send("#{nutrient[:col]}=", value)
      end
    },
    serving_sizes: {
      filename: 'branded/Serving_size.csv',
      key_column: 'NDB_No',
      csv: true,
      map_into: 'usda_food_weights',
      columns: %w[
        NDB_No Serving_Size Serving_Size_UOM Household_Serving_Size
        Household_Serving_Size_UOM Preparation_State
      ],
      map: {
        amount: 'Household_Serving_Size',
        description: 'Household_Serving_Size_UOM',
        gram_weight: 'Serving_Size'
      }
    }
  }.freeze

  # @param directory [String] directory that contains the files named in FILES
  def initialize(directory)
    @directory = directory
  end

  # Deletes all UsdaFood / UsdaFoodWeight rows and re-imports them from the
  # files in the import directory, then refreshes every Food that has a
  # non-empty ndbn.
  #
  # Temporary "<file>.sorted" copies are written next to the inputs and are
  # removed again even when the import fails.
  def import
    UsdaFoodWeight.delete_all
    UsdaFood.delete_all

    sorted_files = {}
    opened_files = {}

    begin
      # Sort each file by NDB_No. The target path is recorded before sorting
      # so the ensure block removes it even if a later step raises.
      FILES.each do |name, data|
        filename = File.join(@directory, data[:filename])
        sorted_files[name] = "#{filename}.sorted"
        sort_file(data, filename, sorted_files[name])
      end

      sorted_files.each do |name, filename|
        # Options are passed as keywords; a positional Hash is rejected by
        # CSV.open on Ruby 3.
        opened_files[name] = CSV.open(filename, 'r:iso-8859-1:utf-8', **csv_options(FILES[name]))
      end

      build_enumerator(opened_files).each_slice(500) do |slice|
        UsdaFood.transaction do
          slice.each { |data| build_food(data).save! }
        end
      end
    ensure
      opened_files.each_value(&:close)
      sorted_files.each_value { |path| File.delete(path) if File.exist?(path) }
    end

    Food.where('ndbn != ?', '').where('ndbn IS NOT NULL').each do |i|
      i.set_usda_food(i.usda_food)
      i.save!
    end
  end

  # Merges the (already sorted) files into a stream of per-food row groups.
  #
  # @param opened_files [Hash{Symbol => CSV}] FILES name => opened CSV reader
  # @return [Enumerator] yields one Hash per NDB number, mapping each FILES
  #   name that has rows for that number to the Array of those rows
  def build_enumerator(opened_files)
    enumerate_data = {}

    opened_files.each do |name, csv|
      key_column = FILES[name][:key_column]
      csv_enumerator = csv.each
      peek_next_ndbn = -> { csv_enumerator.peek[key_column] }

      state = {
        enumerator: csv_enumerator,
        done: false,
        next_ndbn: nil,
        peek_next_ndbn: peek_next_ndbn
      }

      # A file without data rows is finished from the start instead of
      # aborting the whole import with StopIteration.
      begin
        state[:next_ndbn] = peek_next_ndbn.call
      rescue StopIteration
        state[:done] = true
      end

      enumerate_data[name] = state
    end

    Enumerator.new do |yielder|
      loop do
        pending = enumerate_data.values.reject { |d| d[:done] }
        break if pending.empty?

        # Keys are compared as read from the files (strings).
        current_ndbn = pending.map { |d| d[:next_ndbn] }.min
        results = Hash.new { |hash, key| hash[key] = [] }

        enumerate_data.each do |name, data|
          next if data[:done]

          begin
            while data[:next_ndbn] == current_ndbn
              results[name] << data[:enumerator].next
              data[:next_ndbn] = data[:peek_next_ndbn].call
            end
          rescue StopIteration
            # peek ran past the last row: this file is exhausted.
            data[:done] = true
          end
        end

        yielder << results
      end
    end
  end

  # @param data [Hash] one entry of FILES
  # @return [Hash] options for CSV.open matching the file's format
  def csv_options(data)
    if data[:csv]
      { headers: true }
    else
      { col_sep: '^', quote_char: '~', headers: data[:columns] }
    end
  end

  private

  # Writes a copy of filename, numerically sorted by the key column, to
  # sorted_filename. Nothing is written when the key column is not listed in
  # data[:columns].
  def sort_file(data, filename, sorted_filename)
    idx = data[:columns].index(data[:key_column])
    return unless idx

    column = idx + 1 # sort(1) numbers fields from 1
    src = Shellwords.escape(filename)
    dst = Shellwords.escape(sorted_filename)

    if data[:csv]
      # Keep the header row on top and sort only the data rows.
      `head -n 1 #{src} > #{dst}`
      `tail -n +2 #{src} | sort -n -t',' -k#{column} - >> #{dst}`
    else
      # The key is restricted to characters 2-6 of the field, which skips the
      # field's first character (presumably the '~' quote).
      `sort -n -t'^' -k#{column}.2,#{column}.6 #{src} > #{dst}`
    end
  end

  # Builds an unsaved UsdaFood (with associated children) from one group
  # yielded by #build_enumerator.
  def build_food(data)
    food = UsdaFood.new

    data.each do |name, rows|
      file_info = FILES[name]
      obj = food

      rows.each do |row|
        # Files with :map_into create one associated record per row.
        obj = food.send(file_info[:map_into]).build if file_info[:map_into]

        (file_info[:static] || {}).each { |k, v| obj.send("#{k}=", v) }

        if file_info[:map_function]
          file_info[:map_function].call(obj, row)
        else
          file_info[:map].each { |db, col| obj.send("#{db}=", row[col]) }
        end
      end
    end

    food
  end
end