2016-01-24 17:10:43 -06:00
|
|
|
require 'csv'
require 'shellwords'
|
|
|
|
|
|
|
|
# Imports USDA food data files found under a directory into UsdaFood records
# (plus their associated usda_food_weights records).
#
# Every file described in FILES is first sorted by its key column, then all
# files are read in lock-step so that the rows sharing one key value are
# combined into a single UsdaFood.
class UsdaImporter
  # Branded-food nutrient codes (Nutrient_Code column) mapped to the UsdaFood
  # attribute they populate.
  #   col:     attribute name to assign
  #   unit:    the Output_uom a row must carry to be imported
  #   convert: optional lambda applied to the raw Output_value before assignment
  BRANDED_NUTRIENTS = {
    208 => { col: 'kcal', unit: 'kcal' },
    203 => { col: 'protein', unit: 'g' },
    255 => { col: 'water', unit: 'g' },
    204 => { col: 'lipid', unit: 'g' },
    207 => { col: 'ash', unit: 'g' },
    205 => { col: 'carbohydrates', unit: 'g' },
    291 => { col: 'fiber', unit: 'g' },
    269 => { col: 'sugar', unit: 'g' },
    301 => { col: 'calcium', unit: 'mg' },
    303 => { col: 'iron', unit: 'mg' },
    304 => { col: 'magnesium', unit: 'mg' },
    305 => { col: 'phosphorus', unit: 'mg' },
    306 => { col: 'potassium', unit: 'mg' },
    307 => { col: 'sodium', unit: 'mg' },
    309 => { col: 'zinc', unit: 'mg' },
    312 => { col: 'copper', unit: 'mg' },
    315 => { col: 'manganese', unit: 'mg' },
    401 => { col: 'vit_c', unit: 'mg' },
    415 => { col: 'vit_b6', unit: 'mg' },
    418 => { col: 'vit_b12', unit: 'mcg' },
    # The non-branded import fills vit_a from 'Vit_A_RAE' and vit_d from
    # 'Vit_D_mcg', so the branded IU figures are converted before being stored.
    # NOTE(review): the factors presumably turn IU into mcg - confirm.
    318 => { col: 'vit_a', unit: 'IU', convert: ->(x) { x.to_f * 0.3 } },
    324 => { col: 'vit_d', unit: 'IU', convert: ->(x) { x.to_f / 40.0 } },
    430 => { col: 'vit_k', unit: 'mcg' },
    601 => { col: 'cholesterol', unit: 'mg' }
  }.freeze

  # Description of every input file, keyed by an internal name.
  #   filename:     path relative to the import directory
  #   key_column:   column holding the food key used to sort and merge the files
  #   csv:          true for comma-separated files with a header row; otherwise
  #                 the file is parsed as '^'-separated, '~'-quoted, headerless
  #   columns:      column names in file order
  #   map:          model attribute => source column
  #   map_function: lambda (obj, row) used instead of :map when present
  #   map_into:     association on the food; each row builds a new record in it
  #   static:       attribute => constant value assigned for every row
  FILES = {
    abbreviated: {
      filename: 'ABBREV.txt',
      key_column: 'NDB_No',
      columns: [
        'NDB_No',
        'Shrt_Desc',
        'Water',
        'Energ_Kcal',
        'Protein',
        'Lipid_Tot',
        'Ash',
        'Carbohydrt',
        'Fiber_TD',
        'Sugar_Tot',
        'Calcium',
        'Iron',
        'Magnesium',
        'Phosphorus',
        'Potassium',
        'Sodium',
        'Zinc',
        'Copper',
        'Manganese',
        'Selenium',
        'Vit_C',
        'Thiamin',
        'Riboflavin',
        'Niacin',
        'Panto_acid',
        'Vit_B6',
        'Folate_Tot',
        'Folic_acid',
        'Food_Folate',
        'Folate_DFE',
        'Choline_Tot',
        'Vit_B12',
        'Vit_A_IU',
        'Vit_A_RAE',
        'Retinol',
        'Alpha_Carot',
        'Beta_Carot',
        'Beta_Crypt',
        'Lycopene',
        'Lut+Zea',
        'Vit_E',
        'Vit_D_mcg',
        'Vit_D_IU',
        'Vit_K',
        'FA_Sat',
        'FA_Mono',
        'FA_Poly',
        'Cholestrl',
        'GmWt_1',
        'GmWt_Desc1',
        'GmWt_2',
        'GmWt_Desc2',
        'Refuse_Pct'
      ],
      map: {
        ndbn: 'NDB_No',
        short_description: 'Shrt_Desc',
        water: 'Water',
        kcal: 'Energ_Kcal',
        protein: 'Protein',
        lipid: 'Lipid_Tot',
        ash: 'Ash',
        carbohydrates: 'Carbohydrt',
        fiber: 'Fiber_TD',
        sugar: 'Sugar_Tot',
        gram_weight_1: 'GmWt_1',
        gram_weight_2: 'GmWt_2',
        gram_weight_desc_1: 'GmWt_Desc1',
        gram_weight_desc_2: 'GmWt_Desc2',
        refuse_percent: 'Refuse_Pct',
        calcium: 'Calcium',
        iron: 'Iron',
        magnesium: 'Magnesium',
        phosphorus: 'Phosphorus',
        potassium: 'Potassium',
        sodium: 'Sodium',
        zinc: 'Zinc',
        copper: 'Copper',
        manganese: 'Manganese',
        vit_c: 'Vit_C',
        vit_b6: 'Vit_B6',
        vit_b12: 'Vit_B12',
        vit_a: 'Vit_A_RAE',
        vit_e: 'Vit_E',
        vit_d: 'Vit_D_mcg',
        vit_k: 'Vit_K',
        cholesterol: 'Cholestrl'
      },
      static: {
        source: 'sr',
        nutrient_unit: '100 grams'
      }
    },

    food_data: {
      filename: 'FOOD_DES.txt',
      key_column: 'NDB_No',
      columns: [
        'NDB_No',
        'FdGrp_Cd',
        'Long_Desc',
        'Shrt_Desc',
        'ComName',
        'ManufacName',
        'Survey',
        'Ref_desc',
        'Refuse',
        'SciName',
        'N_Factor',
        'Pro_Factor',
        'Fat_Factor',
        'CHO_Factor'
      ],
      map: {
        scientific_name: 'SciName',
        refuse_description: 'Ref_desc',
        long_description: 'Long_Desc'
      }
    },

    weights: {
      filename: 'WEIGHT.txt',
      key_column: 'NDB_No',
      map_into: 'usda_food_weights',
      columns: [
        'NDB_No',
        'Seq',
        'Amount',
        'Msre_Desc',
        'Gm_Wgt',
        'Num_Data_Pts',
        'Std_Dev'
      ],
      map: {
        amount: 'Amount',
        description: 'Msre_Desc',
        gram_weight: 'Gm_Wgt'
      }
    },

    # Branded Food DB files
    products: {
      filename: 'branded/Products.csv',
      key_column: 'NDB_Number',
      csv: true,
      columns: [
        "NDB_Number",
        "long_name",
        "data_source",
        "gtin_upc",
        "manufacturer",
        "date_modified",
        "date_available",
        "ingredients_english"
      ],
      map: {
        ndbn: 'NDB_Number',
        long_description: 'long_name',
        manufacturer: 'manufacturer',
        ingredients: 'ingredients_english'
      },
      static: {
        source: 'bf',
        nutrient_unit: '100 g'
      }
    },

    nutrients: {
      filename: 'branded/Nutrients.csv',
      key_column: 'NDB_No',
      csv: true,
      columns: [
        "NDB_No",
        "Nutrient_Code",
        "Nutrient_name",
        "Derivation_Code",
        "Output_value",
        "Output_uom"
      ],
      # One row per nutrient: look the code up in BRANDED_NUTRIENTS and assign
      # the value only when the row's unit is the expected one.
      map_function: ->(obj, row) do
        nutrient = BRANDED_NUTRIENTS[row['Nutrient_Code'].to_i]
        if nutrient && nutrient[:unit] == row['Output_uom']
          value = row['Output_value']
          value = nutrient[:convert].call(value) if nutrient[:convert]
          obj.send("#{nutrient[:col]}=", value)
        end
      end
    },

    serving_sizes: {
      filename: 'branded/Serving_size.csv',
      key_column: 'NDB_No',
      csv: true,
      map_into: 'usda_food_weights',
      columns: [
        "NDB_No",
        "Serving_Size",
        "Serving_Size_UOM",
        "Household_Serving_Size",
        "Household_Serving_Size_UOM",
        "Preparation_State"
      ],
      map: {
        amount: 'Household_Serving_Size',
        description: 'Household_Serving_Size_UOM',
        gram_weight: 'Serving_Size'
      }
    }
  }.freeze

  # directory - path containing the files named in FILES.
  def initialize(directory)
    @directory = directory
  end

  # Replaces all UsdaFood / UsdaFoodWeight records with the contents of the
  # data files, then re-applies the USDA data to every Food that has an ndbn.
  #
  # Temporary "<file>.sorted" copies are written next to the inputs and removed
  # again when the import finishes or fails.
  def import
    UsdaFoodWeight.delete_all
    UsdaFood.delete_all

    sorted_files = {}
    opened_files = {}

    begin
      # Sort each file by its key column so the files can be merged in one pass.
      FILES.each do |name, data|
        filename = File.join(@directory, data[:filename])
        sorted_filename = "#{filename}.sorted"
        # Registered before sorting so the ensure block cleans up partial output.
        sorted_files[name] = sorted_filename
        sort_file(data, filename, sorted_filename)
      end

      sorted_files.each do |name, filename|
        # Options are splatted: CSV.open takes them as keywords on Ruby 3.
        opened_files[name] = CSV.open(filename, 'r:iso-8859-1:utf-8', **csv_options(FILES[name]))
      end

      # One transaction per 500 foods keeps the number of commits down.
      build_enumerator(opened_files).each_slice(500) do |slice|
        UsdaFood.transaction do
          slice.each { |data| build_food(data).save! }
        end
      end
    ensure
      opened_files.each_value(&:close)
      sorted_files.each_value { |path| File.delete(path) if File.exist?(path) }
    end

    Food.where('ndbn != ?', '').where('ndbn IS NOT NULL').each do |i|
      i.set_usda_food(i.usda_food)
      i.save!
    end
  end

  # Merges the already-sorted files into a stream of per-food row groups.
  #
  # opened_files - Hash of FILES name => opened CSV, each sorted by key column.
  #
  # Returns an Enumerator yielding, for each key value, a Hash of
  # FILES name => Array of the rows carrying that key. Files that are empty
  # (or contain only a header) simply contribute no rows.
  def build_enumerator(opened_files)
    enumerate_data = {}
    opened_files.each do |name, csv|
      key_column = FILES[name][:key_column]
      csv_enumerator = csv.each
      peek_next_ndbn = -> { csv_enumerator.peek[key_column] }

      begin
        next_ndbn = peek_next_ndbn.call
        done = false
      rescue StopIteration
        # No data rows at all: the file is exhausted from the start.
        next_ndbn = nil
        done = true
      end

      enumerate_data[name] = {
        enumerator: csv_enumerator,
        done: done,
        next_ndbn: next_ndbn,
        peek_next_ndbn: peek_next_ndbn
      }
    end

    Enumerator.new do |yielder|
      loop do
        pending = enumerate_data.values.reject { |d| d[:done] }
        break if pending.empty?

        # The smallest key waiting at the head of any unfinished file.
        current_ndbn = pending.map { |d| d[:next_ndbn] }.min
        results = Hash.new { |hash, key| hash[key] = [] }

        enumerate_data.each do |name, data|
          next if data[:done]

          begin
            while data[:next_ndbn] == current_ndbn
              results[name] << data[:enumerator].next
              data[:next_ndbn] = data[:peek_next_ndbn].call
            end
          rescue StopIteration
            # peek ran off the end: the row just read was the file's last.
            data[:done] = true
          end
        end

        yielder << results
      end
    end
  end

  # Returns the CSV parser options for one FILES entry: header-row CSV when
  # data[:csv] is set, otherwise '^'-separated / '~'-quoted fields named after
  # data[:columns].
  def csv_options(data)
    if data[:csv]
      { headers: true }
    else
      { col_sep: '^', quote_char: '~', headers: data[:columns] }
    end
  end

  private

  # Writes a copy of filename, numerically sorted by the entry's key column, to
  # sorted_filename using sort(1). Does nothing when the key column is not
  # listed in data[:columns].
  def sort_file(data, filename, sorted_filename)
    idx = data[:columns].index(data[:key_column])
    return unless idx

    field = idx + 1 # sort(1) numbers fields from 1
    src = Shellwords.escape(filename)
    dest = Shellwords.escape(sorted_filename)

    if data[:csv]
      # Keep the header line first; sort only the data lines.
      `head -n 1 #{src} > #{dest}`
      `tail -n +2 #{src} | sort -n -t',' -k#{field} - >> #{dest}`
    else
      # Key is characters 2-6 of the field, which skips the first character
      # (presumably the '~' quote these files are parsed with).
      `sort -n -t'^' -k#{field}.2,#{field}.6 #{src} > #{dest}`
    end
  end

  # Builds (without saving) one UsdaFood from a row group produced by
  # build_enumerator.
  def build_food(data)
    food = UsdaFood.new

    data.each do |name, rows|
      file_info = FILES[name]
      obj = food

      rows.each do |row|
        # Rows of map_into files each become a new associated record.
        obj = food.send(file_info[:map_into]).build if file_info[:map_into]

        if file_info[:static]
          file_info[:static].each { |k, v| obj.send("#{k}=", v) }
        end

        if file_info[:map_function]
          file_info[:map_function].call(obj, row)
        else
          file_info[:map].each { |db, col| obj.send("#{db}=", row[col]) }
        end
      end
    end

    food
  end
end
|