require 'csv'

# Imports USDA nutrient database text files from a directory into UsdaFood
# records (and their associated weight records).
#
# The input files use '^' as the column separator and '~' as the quote
# character (see #csv_options). Every file is first sorted by its NDB_No
# column and the sorted files are then merged row-group by row-group, so that
# each food is assembled from all files in a single pass.
class UsdaImporter
  # Per-file import configuration:
  #   :filename - file name inside the import directory
  #   :columns  - column names, in file order (the files carry no header row)
  #   :map      - model attribute => source column
  #   :map_into - optional association on the food; when present, every row
  #               of the file is written to a newly built associated record
  #               instead of the food itself
  FILES = {
    abbreviated: {
      filename: 'ABBREV.txt',
      columns: %w[
        NDB_No Shrt_Desc Water Energ_Kcal Protein Lipid_Tot Ash Carbohydrt
        Fiber_TD Sugar_Tot Calcium Iron Magnesium Phosphorus Potassium Sodium
        Zinc Copper Manganese Selenium Vit_C Thiamin Riboflavin Niacin
        Panto_acid Vit_B6 Folate_Tot Folic_acid Food_Folate Folate_DFE
        Choline_Tot Vit_B12 Vit_A_IU Vit_A_RAE Retinol Alpha_Carot Beta_Carot
        Beta_Crypt Lycopene Lut+Zea Vit_E Vit_D_mcg Vit_D_IU Vit_K FA_Sat
        FA_Mono FA_Poly Cholestrl GmWt_1 GmWt_Desc1 GmWt_2 GmWt_Desc2
        Refuse_Pct
      ],
      map: {
        ndbn: 'NDB_No',
        short_description: 'Shrt_Desc',
        water: 'Water',
        kcal: 'Energ_Kcal',
        protein: 'Protein',
        lipid: 'Lipid_Tot',
        ash: 'Ash',
        carbohydrates: 'Carbohydrt',
        fiber: 'Fiber_TD',
        sugar: 'Sugar_Tot',
        gram_weight_1: 'GmWt_1',
        gram_weight_2: 'GmWt_2',
        gram_weight_desc_1: 'GmWt_Desc1',
        gram_weight_desc_2: 'GmWt_Desc2',
        refuse_percent: 'Refuse_Pct'
      }
    },
    food_data: {
      filename: 'FOOD_DES.txt',
      columns: %w[
        NDB_No FdGrp_Cd Long_Desc Shrt_Desc ComName ManufacName Survey
        Ref_desc Refuse SciName N_Factor Pro_Factor Fat_Factor CHO_Factor
      ],
      map: {
        scientific_name: 'SciName',
        refuse_description: 'Ref_desc',
        long_description: 'Long_Desc'
      }
    },
    weights: {
      filename: 'WEIGHT.txt',
      map_into: 'usda_food_weights',
      columns: %w[NDB_No Seq Amount Msre_Desc Gm_Wgt Num_Data_Pts Std_Dev],
      map: {
        amount: 'Amount',
        description: 'Msre_Desc',
        gram_weight: 'Gm_Wgt'
      }
    }
  }.freeze

  # @param directory [String] directory containing the files named in FILES
  def initialize(directory)
    @directory = directory
  end

  # Deletes all existing UsdaFoodWeight and UsdaFood records, then rebuilds
  # them from the files in the import directory. Foods are saved in
  # transactions of 500.
  #
  # NOTE(review): an earlier, commented-out implementation that lived here
  # also re-linked Ingredient records (Ingredient#set_usda_food) after the
  # import; this method does not do that.
  #
  # @raise [RuntimeError] if the external `sort` command fails
  # @raise [ArgumentError] if a configured file has no NDB_No column
  def import
    UsdaFoodWeight.delete_all
    UsdaFood.delete_all

    sorted_files = {}
    opened_files = {}

    begin
      # Sort each file by NDB_No. The target path is recorded before sorting
      # so a partially written file is still cleaned up on failure.
      FILES.each do |name, data|
        source = File.join(@directory, data[:filename])
        sorted_files[name] = "#{source}.sorted"
        sort_by_ndbn(source, sorted_files[name], data[:columns])
      end

      sorted_files.each do |name, filename|
        # Options must be passed as keywords; a positional hash is rejected
        # by current versions of CSV.open.
        opened_files[name] = CSV.open(filename, 'r:iso-8859-1:utf-8', **csv_options(FILES[name][:columns]))
      end

      build_enumerator(opened_files).each_slice(500) do |slice|
        UsdaFood.transaction do
          slice.each do |data|
            food = UsdaFood.new
            data.each { |name, rows| assign_rows(food, FILES[name], rows) }
            food.save!
          end
        end
      end
    ensure
      opened_files.each_value(&:close)
      sorted_files.each_value { |path| File.delete(path) if File.exist?(path) }
    end
  end

  # Merges several CSV readers that are each sorted by NDB_No.
  #
  # @param opened_files [Hash{Symbol => CSV}] readers keyed by file name
  # @return [Enumerator] yields, in ascending NDB_No order, one Hash per
  #   distinct NDB_No mapping file name => Array of rows carrying that
  #   NDB_No. Files without a row for the current NDB_No are absent from the
  #   yielded Hash.
  def build_enumerator(opened_files)
    cursors = opened_files.each_with_object({}) do |(name, csv), acc|
      cursor = { enumerator: csv.each, done: false, next_ndbn: nil }
      advance(cursor) # an empty file is marked done right away
      acc[name] = cursor
    end

    Enumerator.new do |yielder|
      loop do
        # Only files that still have rows take part in choosing the next key;
        # otherwise a file that ended early would pin the minimum forever.
        active = cursors.reject { |_name, cursor| cursor[:done] }
        break if active.empty?

        current_ndbn = active.values.map { |cursor| cursor[:next_ndbn] }.min
        results = Hash.new { |hash, key| hash[key] = [] }

        active.each do |name, cursor|
          while !cursor[:done] && cursor[:next_ndbn] == current_ndbn
            results[name] << cursor[:enumerator].next
            advance(cursor)
          end
        end

        yielder << results
      end
    end
  end

  # @param headers [Array<String>] column names to assign to each row
  # @return [Hash] CSV options for the '^'-separated, '~'-quoted input files
  def csv_options(headers)
    { col_sep: '^', quote_char: '~', headers: headers }
  end

  private

  # Sorts +source+ numerically by its NDB_No column into +destination+ using
  # the external `sort` command. Arguments are passed as an argv array so
  # that paths are never interpreted by a shell.
  def sort_by_ndbn(source, destination, columns)
    index = columns.index('NDB_No')
    raise ArgumentError, "#{File.basename(source)} has no NDB_No column" unless index

    field = index + 1 # sort(1) numbers fields from 1
    # Characters 2-6 of the field are used as the key; presumably this skips
    # the opening '~' quote and covers the five-digit number.
    sorted = system('sort', '-n', '-t^', "-k#{field}.2,#{field}.6", source, out: destination)
    raise "sort failed for #{source}" unless sorted
  end

  # Copies the mapped columns of every row onto the food, or, when the file
  # declares :map_into, onto a freshly built associated record per row.
  def assign_rows(food, file_info, rows)
    rows.each do |row|
      target = file_info[:map_into] ? food.public_send(file_info[:map_into]).build : food
      file_info[:map].each { |attribute, column| target.public_send("#{attribute}=", row[column]) }
    end
  end

  # Records the NDB_No of the cursor's next row, or marks the cursor as done
  # when its reader has no more rows.
  def advance(cursor)
    cursor[:next_ndbn] = cursor[:enumerator].peek['NDB_No']
  rescue StopIteration
    cursor[:done] = true
    cursor[:next_ndbn] = nil
  end
end