# parsley/lib/usda_importer.rb

require 'csv'
class UsdaImporter
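  # Nutrient codes from the Branded Food Products Nutrients.csv mapped to
  # UsdaFood columns. A value is only copied when its reported unit matches
  # :unit; the optional :convert lambdas rescale IU values (retinol IU * 0.3
  # is roughly mcg RAE, vitamin D IU / 40 is mcg), matching the units used by
  # the Standard Reference columns.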
  BRANDED_NUTRIENTS = {
    208 => { col: 'kcal',          unit: 'kcal' },
    203 => { col: 'protein',       unit: 'g' },
    255 => { col: 'water',         unit: 'g' },
    204 => { col: 'lipid',         unit: 'g' },
    207 => { col: 'ash',           unit: 'g' },
    205 => { col: 'carbohydrates', unit: 'g' },
    291 => { col: 'fiber',         unit: 'g' },
    269 => { col: 'sugar',         unit: 'g' },
    301 => { col: 'calcium',       unit: 'mg' },
    303 => { col: 'iron',          unit: 'mg' },
    304 => { col: 'magnesium',     unit: 'mg' },
    305 => { col: 'phosphorus',    unit: 'mg' },
    306 => { col: 'potassium',     unit: 'mg' },
    307 => { col: 'sodium',        unit: 'mg' },
    309 => { col: 'zinc',          unit: 'mg' },
    312 => { col: 'copper',        unit: 'mg' },
    315 => { col: 'manganese',     unit: 'mg' },
    401 => { col: 'vit_c',         unit: 'mg' },
    415 => { col: 'vit_b6',        unit: 'mg' },
    418 => { col: 'vit_b12',       unit: 'mcg' },
    318 => { col: 'vit_a',         unit: 'IU', convert: ->(x) { x.to_f * 0.3 } },
    324 => { col: 'vit_d',         unit: 'IU', convert: ->(x) { x.to_f / 40.0 } },
    430 => { col: 'vit_k',         unit: 'mcg' },
    601 => { col: 'cholesterol',   unit: 'mg' }
  }
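
  # Source files to import, keyed by a short name. Each entry describes the
  # file layout (:filename, :columns, :key_column, :csv) and how its rows map
  # onto the model: :map copies columns onto attributes, :map_function runs
  # custom per-row logic, :map_into builds an associated record instead of
  # writing to the food itself, and :static sets fixed attribute values.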
  FILES = {
    abbreviated: {
      filename: 'ABBREV.txt',
      key_column: 'NDB_No',
      columns: %w[
        NDB_No Shrt_Desc Water Energ_Kcal Protein Lipid_Tot Ash Carbohydrt
        Fiber_TD Sugar_Tot Calcium Iron Magnesium Phosphorus Potassium Sodium
        Zinc Copper Manganese Selenium Vit_C Thiamin Riboflavin Niacin
        Panto_acid Vit_B6 Folate_Tot Folic_acid Food_Folate Folate_DFE
        Choline_Tot Vit_B12 Vit_A_IU Vit_A_RAE Retinol Alpha_Carot Beta_Carot
        Beta_Crypt Lycopene Lut+Zea Vit_E Vit_D_mcg Vit_D_IU Vit_K FA_Sat
        FA_Mono FA_Poly Cholestrl GmWt_1 GmWt_Desc1 GmWt_2 GmWt_Desc2
        Refuse_Pct
      ],
      map: {
        ndbn: 'NDB_No',
        short_description: 'Shrt_Desc',
        water: 'Water',
        kcal: 'Energ_Kcal',
        protein: 'Protein',
        lipid: 'Lipid_Tot',
        ash: 'Ash',
        carbohydrates: 'Carbohydrt',
        fiber: 'Fiber_TD',
        sugar: 'Sugar_Tot',
        gram_weight_1: 'GmWt_1',
        gram_weight_2: 'GmWt_2',
        gram_weight_desc_1: 'GmWt_Desc1',
        gram_weight_desc_2: 'GmWt_Desc2',
        refuse_percent: 'Refuse_Pct',
        calcium: 'Calcium',
        iron: 'Iron',
        magnesium: 'Magnesium',
        phosphorus: 'Phosphorus',
        potassium: 'Potassium',
        sodium: 'Sodium',
        zinc: 'Zinc',
        copper: 'Copper',
        manganese: 'Manganese',
        vit_c: 'Vit_C',
        vit_b6: 'Vit_B6',
        vit_b12: 'Vit_B12',
        vit_a: 'Vit_A_RAE',
        vit_e: 'Vit_E',
        vit_d: 'Vit_D_mcg',
        vit_k: 'Vit_K',
        cholesterol: 'Cholestrl'
      },
      static: {
        source: 'sr',
        nutrient_unit: '100 grams'
      }
    },
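    # FOOD_DES.txt adds the long description, scientific name and refuse
    # description on top of the abbreviated data for the same NDB number.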
    food_data: {
      filename: 'FOOD_DES.txt',
      key_column: 'NDB_No',
      columns: %w[
        NDB_No FdGrp_Cd Long_Desc Shrt_Desc ComName ManufacName Survey
        Ref_desc Refuse SciName N_Factor Pro_Factor Fat_Factor CHO_Factor
      ],
      map: {
        scientific_name: 'SciName',
        refuse_description: 'Ref_desc',
        long_description: 'Long_Desc'
      }
    },
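    # WEIGHT.txt rows become usda_food_weights records (household measures
    # with their gram weights) rather than attributes on the food itself.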
    weights: {
      filename: 'WEIGHT.txt',
      key_column: 'NDB_No',
      map_into: 'usda_food_weights',
      columns: %w[
        NDB_No Seq Amount Msre_Desc Gm_Wgt Num_Data_Pts Std_Dev
      ],
      map: {
        amount: 'Amount',
        description: 'Msre_Desc',
        gram_weight: 'Gm_Wgt'
      }
    },
    # Branded Food Products database files
    products: {
      filename: 'branded/Products.csv',
      key_column: 'NDB_Number',
      csv: true,
      columns: %w[
        NDB_Number long_name data_source gtin_upc manufacturer date_modified
        date_available ingredients_english
      ],
      map: {
        ndbn: 'NDB_Number',
        long_description: 'long_name',
        manufacturer: 'manufacturer',
        ingredients: 'ingredients_english'
      },
      static: {
        source: 'bf',
        nutrient_unit: '100 g'
      }
    },
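    # Nutrients.csv has one row per nutrient per food, so it is mapped with a
    # custom function against BRANDED_NUTRIENTS instead of a column map.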
    nutrients: {
      filename: 'branded/Nutrients.csv',
      key_column: 'NDB_No',
      csv: true,
      columns: %w[
        NDB_No Nutrient_Code Nutrient_name Derivation_Code Output_value
        Output_uom
      ],
      map_function: ->(obj, row) do
        mapping = BRANDED_NUTRIENTS[row['Nutrient_Code'].to_i]
        if mapping && mapping[:unit] == row['Output_uom']
          value = row['Output_value']
          value = mapping[:convert].call(value) if mapping[:convert]
          obj.send("#{mapping[:col]}=", value)
        end
      end
    },
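    # Serving_size.csv also maps into usda_food_weights: the household serving
    # becomes the measure amount/description and Serving_Size is stored as the
    # gram weight.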
    serving_sizes: {
      filename: 'branded/Serving_size.csv',
      key_column: 'NDB_No',
      csv: true,
      map_into: 'usda_food_weights',
      columns: %w[
        NDB_No Serving_Size Serving_Size_UOM Household_Serving_Size
        Household_Serving_Size_UOM Preparation_State
      ],
      map: {
        amount: 'Household_Serving_Size',
        description: 'Household_Serving_Size_UOM',
        gram_weight: 'Serving_Size'
      }
    }
  }
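
  # directory: path to the extracted USDA data files (the Standard Reference
  # ASCII files plus a branded/ subdirectory containing the CSV exports).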
  def initialize(directory)
    @directory = directory
  end
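
  # Rebuilds the UsdaFood tables from scratch: sorts every source file by its
  # NDB number, streams the sorted files in lockstep so all rows for one food
  # arrive together, and inserts the foods in batches. Afterwards, existing
  # Food records are re-linked to the freshly imported UsdaFood rows.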
  def import
    UsdaFoodWeight.delete_all
    UsdaFood.delete_all
    sorted_files = {}
    opened_files = {}
    # Sort each file by NDB_No
    FILES.each do |name, data|
      filename = File.join(@directory, data[:filename])
      sorted_filename = "#{filename}.sorted"
      idx = data[:columns].index(data[:key_column])
      if idx
        idx += 1
        if data[:csv]
          `head -n 1 #{filename} > #{sorted_filename}`
          `tail -n +2 #{filename} | sort -n -t',' -k#{idx} - >> #{sorted_filename}`
        else
          `sort -n -t'^' -k#{idx}.2,#{idx}.6 #{filename} > #{sorted_filename}`
        end
      end
      sorted_files[name] = sorted_filename
    end
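
    # Open each sorted file and stream them together; the temporary sorted
    # copies are removed in the ensure block even if the import fails.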
    begin
      sorted_files.each do |name, filename|
        data = FILES[name]
        opened_files[name] = CSV.open(filename, 'r:iso-8859-1:utf-8', csv_options(data))
      end
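
      # Each item yielded by build_enumerator holds every row for a single NDB
      # number, keyed by file name; insert 500 foods per transaction.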
      build_enumerator(opened_files).each_slice(500) do |slice|
        UsdaFood.transaction do
          slice.each do |food_rows|
            food = UsdaFood.new
            food_rows.each do |name, rows|
              file_info = FILES[name]
              obj = food
              rows.each do |row|
                if file_info[:map_into]
                  obj = food.send(file_info[:map_into]).build
                end
                if file_info[:static]
                  file_info[:static].each do |k, v|
                    obj.send("#{k}=", v)
                  end
                end
                if file_info[:map_function]
                  file_info[:map_function].call(obj, row)
                else
                  file_info[:map].each do |db, col|
                    obj.send("#{db}=", row[col])
                  end
                end
              end
            end
            food.save!
          end
        end
      end
    ensure
      opened_files.each_value(&:close)
      sorted_files.each_value { |sorted| `rm #{sorted}` }
    end

    Food.where('ndbn != ?', '').where('ndbn IS NOT NULL').each do |food|
      food.set_usda_food(food.usda_food)
      food.save!
    end
  end
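
  # Merge-reads the sorted files in parallel. Peeks at each file's next NDB
  # number, picks the smallest one still pending, pulls every matching row
  # from every file, and yields them grouped by file name, so each yielded
  # hash describes exactly one food.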
  def build_enumerator(opened_files)
    enumerate_data = {}
    opened_files.each do |name, csv|
      file_data = FILES[name]
      csv_enumerator = csv.each
      enumerate_data[name] = {
        enumerator: csv_enumerator,
        done: false,
        next_ndbn: csv_enumerator.peek[file_data[:key_column]],
        peek_next_ndbn: -> { csv_enumerator.peek[file_data[:key_column]] }
      }
    end

    Enumerator.new do |yielder|
      loop do
        break if enumerate_data.values.all? { |d| d[:done] }
        current_ndbn = enumerate_data.values.reject { |d| d[:done] }.map { |d| d[:next_ndbn] }.min
        results = Hash.new { |hash, key| hash[key] = [] }
        enumerate_data.each do |name, data|
          next if data[:done]
          begin
            while data[:next_ndbn] == current_ndbn
              results[name] << data[:enumerator].next
              data[:next_ndbn] = data[:peek_next_ndbn].call
            end
          rescue StopIteration
            data[:done] = true
          end
        end
        yielder << results
      end
    end
  end
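
  # The SR ASCII files have no header row and use ^ as the field separator
  # with ~ as the quote character, so headers come from the configured column
  # list; the branded files are ordinary CSVs with a header row.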
  def csv_options(data)
    if data[:csv]
      { headers: true }
    else
      { col_sep: '^', quote_char: '~', headers: data[:columns] }
    end
  end
end