require 'csv'

# Imports USDA nutrient data from a directory of caret-delimited text files
# into UsdaFood records (and their usda_food_weights children).
#
# Each source file is sorted by its NDB_No column, the sorted files are then
# read in lock-step, and all rows sharing one NDB_No are combined into a
# single UsdaFood.
class UsdaImporter
  # Description of every source file, keyed by an internal name.
  #
  # :filename - file name inside the import directory.
  # :columns  - header names, in file order, handed to CSV as the headers.
  # :map      - model attribute => source column to copy.
  # :map_into - optional association name on the food; when present, a new
  #             associated object is built for every row and receives the
  #             mapped attributes instead of the food itself.
  FILES = {
    abbreviated: {
      filename: 'ABBREV.txt',
      columns: [
        'NDB_No', 'Shrt_Desc', 'Water', 'Energ_Kcal', 'Protein', 'Lipid_Tot',
        'Ash', 'Carbohydrt', 'Fiber_TD', 'Sugar_Tot', 'Calcium', 'Iron',
        'Magnesium', 'Phosphorus', 'Potassium', 'Sodium', 'Zinc', 'Copper',
        'Manganese', 'Selenium', 'Vit_C', 'Thiamin', 'Riboflavin', 'Niacin',
        'Panto_acid', 'Vit_B6', 'Folate_Tot', 'Folic_acid', 'Food_Folate',
        'Folate_DFE', 'Choline_Tot', 'Vit_B12', 'Vit_A_IU', 'Vit_A_RAE',
        'Retinol', 'Alpha_Carot', 'Beta_Carot', 'Beta_Crypt', 'Lycopene',
        'Lut+Zea', 'Vit_E', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K', 'FA_Sat',
        'FA_Mono', 'FA_Poly', 'Cholestrl', 'GmWt_1', 'GmWt_Desc1', 'GmWt_2',
        'GmWt_Desc2', 'Refuse_Pct'
      ],
      map: {
        ndbn: 'NDB_No',
        short_description: 'Shrt_Desc',
        water: 'Water',
        kcal: 'Energ_Kcal',
        protein: 'Protein',
        lipid: 'Lipid_Tot',
        ash: 'Ash',
        carbohydrates: 'Carbohydrt',
        fiber: 'Fiber_TD',
        sugar: 'Sugar_Tot',
        gram_weight_1: 'GmWt_1',
        gram_weight_2: 'GmWt_2',
        gram_weight_desc_1: 'GmWt_Desc1',
        gram_weight_desc_2: 'GmWt_Desc2',
        refuse_percent: 'Refuse_Pct'
      }
    },
    food_data: {
      filename: 'FOOD_DES.txt',
      columns: [
        'NDB_No', 'FdGrp_Cd', 'Long_Desc', 'Shrt_Desc', 'ComName',
        'ManufacName', 'Survey', 'Ref_desc', 'Refuse', 'SciName', 'N_Factor',
        'Pro_Factor', 'Fat_Factor', 'CHO_Factor'
      ],
      map: {
        scientific_name: 'SciName',
        refuse_description: 'Ref_desc',
        long_description: 'Long_Desc'
      }
    },
    weights: {
      filename: 'WEIGHT.txt',
      map_into: 'usda_food_weights',
      columns: [
        'NDB_No', 'Seq', 'Amount', 'Msre_Desc', 'Gm_Wgt', 'Num_Data_Pts',
        'Std_Dev'
      ],
      map: {
        amount: 'Amount',
        description: 'Msre_Desc',
        gram_weight: 'Gm_Wgt'
      }
    }
  }.freeze

  # directory - path of the directory holding the files named in FILES.
  def initialize(directory)
    @directory = directory
  end

  # Replaces all UsdaFood / UsdaFoodWeight rows with the contents of the
  # source files, then re-applies the USDA food on every Ingredient that has
  # a non-empty ndbn.
  #
  # Existing rows are deleted before anything is read. Foods are saved in
  # transactions of up to 500 records. Temporary "<file>.sorted" files are
  # written next to the sources and removed again, even when the import fails.
  #
  # Raises ArgumentError when a configured file has no NDB_No column,
  # RuntimeError when the external `sort` command fails, and whatever
  # `save!` raises for an invalid record.
  def import
    UsdaFoodWeight.delete_all
    UsdaFood.delete_all

    sorted_files = {}
    opened_files = {}

    begin
      # Sort each file by NDB_No. The target path is registered before
      # sorting so that a partially written file is still cleaned up.
      FILES.each do |name, data|
        source = File.join(@directory, data[:filename])
        sorted_files[name] = "#{source}.sorted"
        sort_by_ndbn(source, sorted_files[name], data[:columns])
      end

      sorted_files.each do |name, filename|
        # Options must be splatted: Ruby 3 no longer converts a trailing
        # positional hash into keyword arguments.
        opened_files[name] = CSV.open(filename, 'r:iso-8859-1:utf-8',
                                      **csv_options(FILES[name][:columns]))
      end

      build_enumerator(opened_files).each_slice(500) do |slice|
        UsdaFood.transaction do
          slice.each { |grouped_rows| build_food(grouped_rows).save! }
        end
      end
    ensure
      opened_files.each_value(&:close)
      sorted_files.each_value { |path| File.delete(path) if File.exist?(path) }
    end

    Ingredient.where('ndbn != ?', '').where('ndbn IS NOT NULL').each do |i|
      i.set_usda_food(i.usda_food)
      i.save!
    end
  end

  # Merges several row sources that are each ordered by NDB_No.
  #
  # opened_files - Hash of name => object whose `each` (without a block)
  #                returns an Enumerator of rows answering to ['NDB_No'].
  #
  # Returns an Enumerator. Every element is a Hash of name => Array of rows,
  # holding all rows from all sources that share the smallest NDB_No not yet
  # emitted. Sources without a row for that NDB_No have no key in the
  # element. Keys are compared with `min`, i.e. as the raw values read from
  # the rows. The enumerator consumes the underlying sources, so it can be
  # walked only once.
  def build_enumerator(opened_files)
    cursors = {}
    opened_files.each do |name, csv|
      cursor = { enumerator: csv.each, done: false, next_ndbn: nil }
      advance_cursor(cursor) # also copes with a source that has no rows
      cursors[name] = cursor
    end

    Enumerator.new do |yielder|
      loop do
        # Only sources that still have rows may take part in choosing the
        # next key; an exhausted source would otherwise pin the minimum to
        # its last key and the merge would never progress.
        active = cursors.reject { |_name, cursor| cursor[:done] }
        break if active.empty?

        current_ndbn = active.values.map { |cursor| cursor[:next_ndbn] }.min
        results = Hash.new { |hash, key| hash[key] = [] }

        active.each do |name, cursor|
          while !cursor[:done] && cursor[:next_ndbn] == current_ndbn
            results[name] << cursor[:enumerator].next
            advance_cursor(cursor)
          end
        end

        yielder << results
      end
    end
  end

  # CSV options shared by all source files: '^' separates fields, '~' quotes
  # them, and `headers` (an Array of column names) labels the fields.
  def csv_options(headers)
    {
      col_sep: '^',
      quote_char: '~',
      headers: headers
    }
  end

  private

  # Writes a copy of `source`, numerically sorted on its NDB_No field, to
  # `destination` using the external `sort` command.
  #
  # The command is run without a shell so paths containing spaces or shell
  # metacharacters are passed through untouched. The key covers characters
  # 2-6 of the field (presumably to skip the leading '~' quote character).
  def sort_by_ndbn(source, destination, columns)
    position = columns.index('NDB_No')
    raise ArgumentError, "no NDB_No column configured for #{source}" unless position

    field = position + 1 # sort(1) numbers fields from 1
    sorted = system('sort', '-n', '-t^', "-k#{field}.2,#{field}.6", source,
                    out: destination)
    raise "failed to sort #{source}" unless sorted
  end

  # Builds (without saving) one UsdaFood from the rows grouped under a single
  # NDB_No, as produced by build_enumerator.
  def build_food(grouped_rows)
    food = UsdaFood.new

    grouped_rows.each do |name, rows|
      file_info = FILES[name]

      rows.each do |row|
        # Files with :map_into create one associated object per row; all
        # other files write straight onto the food.
        target = file_info[:map_into] ? food.public_send(file_info[:map_into]).build : food

        file_info[:map].each do |attribute, column|
          target.public_send("#{attribute}=", row[column])
        end
      end
    end

    food
  end

  # Records the NDB_No of the cursor's next row, or marks the cursor as done
  # when its enumerator has no rows left.
  def advance_cursor(cursor)
    cursor[:next_ndbn] = cursor[:enumerator].peek['NDB_No']
  rescue StopIteration
    cursor[:done] = true
    cursor[:next_ndbn] = nil
  end
end