Adding USDA weights; usda_importer refactor

Dan Elbert 2016-03-09 18:53:47 -06:00
parent 068b01a7c8
commit 2f752eae61
10 changed files with 15727 additions and 117 deletions

View File

@@ -1,6 +1,8 @@
 class UsdaFood < ActiveRecord::Base
   include TokenizedLike

+  has_many :usda_food_weights
+
   def self.search(query)
     tokens = query.to_s.split(' ')

View File

@@ -0,0 +1,5 @@
class UsdaFoodWeight < ActiveRecord::Base
  belongs_to :usda_food
end
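For reviewers, a minimal usage sketch of the new has_many/belongs_to pair (assumes standard Rails conventions; the NDB number and gram weights come from the spec fixture data later in this commit):

food = UsdaFood.find_by(ndbn: '01001')   # "Butter, salted"
food.usda_food_weights.each do |weight|
  # e.g. "1.0 pat (1" sq, 1/3" high) = 5.0 g", "1.0 tbsp = 14.2 g", ...
  puts "#{weight.amount} #{weight.description} = #{weight.gram_weight} g"
end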

View File

@@ -0,0 +1,13 @@
class CreateUsdaFoodWeights < ActiveRecord::Migration
  def change
    create_table :usda_food_weights do |t|
      t.integer :usda_food_id, index: true, null: false
      t.decimal :amount, precision: 5, scale: 3
      t.string :description
      t.decimal :gram_weight, precision: 7, scale: 1

      t.timestamps null: false
    end
  end
end
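A quick sanity check on the chosen column sizes (assuming standard ActiveRecord decimal semantics; not part of the commit):

require 'bigdecimal'

# precision is total digits, scale is digits after the decimal point:
max_amount      = BigDecimal('99.999')    # largest value that fits precision: 5, scale: 3
max_gram_weight = BigDecimal('999999.9')  # largest value that fits precision: 7, scale: 1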

View File

@@ -11,7 +11,7 @@
 #
 # It's strongly recommended that you check this file into your version control system.

-ActiveRecord::Schema.define(version: 20160303180854) do
+ActiveRecord::Schema.define(version: 20160309182253) do

   create_table "ingredients", force: :cascade do |t|
     t.string "name"
@@ -68,6 +68,17 @@ ActiveRecord::Schema.define(version: 20160303180854) do
     t.integer "user_id"
   end

+  create_table "usda_food_weights", force: :cascade do |t|
+    t.integer  "usda_food_id",                          null: false
+    t.decimal  "amount",       precision: 5, scale: 3
+    t.string   "description"
+    t.decimal  "gram_weight",  precision: 7, scale: 1
+    t.datetime "created_at",                            null: false
+    t.datetime "updated_at",                            null: false
+  end
+
+  add_index "usda_food_weights", ["usda_food_id"], name: "index_usda_food_weights_on_usda_food_id"
+
   create_table "usda_foods", force: :cascade do |t|
     t.string "ndbn", limit: 5, null: false
     t.string "long_description"

View File

@@ -2,101 +2,126 @@ require 'csv'

 class UsdaImporter

-  ABBREV_COLUMNS = [
-    'NDB_No',
-    'Shrt_Desc',
-    'Water',
-    'Energ_Kcal',
-    'Protein',
-    'Lipid_Tot',
-    'Ash',
-    'Carbohydrt',
-    'Fiber_TD',
-    'Sugar_Tot',
-    'Calcium',
-    'Iron',
-    'Magnesium',
-    'Phosphorus',
-    'Potassium',
-    'Sodium',
-    'Zinc',
-    'Copper',
-    'Manganese',
-    'Selenium',
-    'Vit_C',
-    'Thiamin',
-    'Riboflavin',
-    'Niacin',
-    'Panto_acid',
-    'Vit_B6',
-    'Folate_Tot',
-    'Folic_acid',
-    'Food_Folate',
-    'Folate_DFE',
-    'Choline_Tot',
-    'Vit_B12',
-    'Vit_A_IU',
-    'Vit_A_RAE',
-    'Retinol',
-    'Alpha_Carot',
-    'Beta_Carot',
-    'Beta_Crypt',
-    'Lycopene',
-    'Lut+Zea',
-    'Vit_E',
-    'Vit_D_mcg',
-    'Vit_D_IU',
-    'Vit_K',
-    'FA_Sat',
-    'FA_Mono',
-    'FA_Poly',
-    'Cholestrl',
-    'GmWt_1',
-    'GmWt_Desc1',
-    'GmWt_2',
-    'GmWt_Desc2',
-    'Refuse_Pct'
-  ]
-
-  FOOD_DATA_COLUMNS = [
-    'NDB_No',
-    'FdGrp_Cd',
-    'Long_Desc',
-    'Shrt_Desc',
-    'ComName',
-    'ManufacName',
-    'Survey',
-    'Ref_desc',
-    'Refuse',
-    'SciName',
-    'N_Factor',
-    'Pro_Factor',
-    'Fat_Factor',
-    'CHO_Factor'
-  ]
-
-  ABBREV_COLUMN_MAP = {
-    ndbn: 'NDB_No',
-    short_description: 'Shrt_Desc',
-    water: 'Water',
-    kcal: 'Energ_Kcal',
-    protein: 'Protein',
-    lipid: 'Lipid_Tot',
-    ash: 'Ash',
-    carbohydrates: 'Carbohydrt',
-    fiber: 'Fiber_TD',
-    sugar: 'Sugar_Tot',
-    gram_weight_1: 'GmWt_1',
-    gram_weight_2: 'GmWt_2',
-    gram_weight_desc_1: 'GmWt_Desc1',
-    gram_weight_desc_2: 'GmWt_Desc2',
-    refuse_percent: 'Refuse_Pct'
-  }
-
-  FOOD_DATA_COLUMN_MAP = {
-    scientific_name: 'SciName',
-    refuse_description: 'Ref_desc',
-    long_description: 'Long_Desc'
-  }
+  FILES = {
+    abbreviated: {
+      filename: 'ABBREV.txt',
+      columns: [
+        'NDB_No',
+        'Shrt_Desc',
+        'Water',
+        'Energ_Kcal',
+        'Protein',
+        'Lipid_Tot',
+        'Ash',
+        'Carbohydrt',
+        'Fiber_TD',
+        'Sugar_Tot',
+        'Calcium',
+        'Iron',
+        'Magnesium',
+        'Phosphorus',
+        'Potassium',
+        'Sodium',
+        'Zinc',
+        'Copper',
+        'Manganese',
+        'Selenium',
+        'Vit_C',
+        'Thiamin',
+        'Riboflavin',
+        'Niacin',
+        'Panto_acid',
+        'Vit_B6',
+        'Folate_Tot',
+        'Folic_acid',
+        'Food_Folate',
+        'Folate_DFE',
+        'Choline_Tot',
+        'Vit_B12',
+        'Vit_A_IU',
+        'Vit_A_RAE',
+        'Retinol',
+        'Alpha_Carot',
+        'Beta_Carot',
+        'Beta_Crypt',
+        'Lycopene',
+        'Lut+Zea',
+        'Vit_E',
+        'Vit_D_mcg',
+        'Vit_D_IU',
+        'Vit_K',
+        'FA_Sat',
+        'FA_Mono',
+        'FA_Poly',
+        'Cholestrl',
+        'GmWt_1',
+        'GmWt_Desc1',
+        'GmWt_2',
+        'GmWt_Desc2',
+        'Refuse_Pct'
+      ],
+      map: {
+        ndbn: 'NDB_No',
+        short_description: 'Shrt_Desc',
+        water: 'Water',
+        kcal: 'Energ_Kcal',
+        protein: 'Protein',
+        lipid: 'Lipid_Tot',
+        ash: 'Ash',
+        carbohydrates: 'Carbohydrt',
+        fiber: 'Fiber_TD',
+        sugar: 'Sugar_Tot',
+        gram_weight_1: 'GmWt_1',
+        gram_weight_2: 'GmWt_2',
+        gram_weight_desc_1: 'GmWt_Desc1',
+        gram_weight_desc_2: 'GmWt_Desc2',
+        refuse_percent: 'Refuse_Pct'
+      }
+    },
+    food_data: {
+      filename: 'FOOD_DES.txt',
+      columns: [
+        'NDB_No',
+        'FdGrp_Cd',
+        'Long_Desc',
+        'Shrt_Desc',
+        'ComName',
+        'ManufacName',
+        'Survey',
+        'Ref_desc',
+        'Refuse',
+        'SciName',
+        'N_Factor',
+        'Pro_Factor',
+        'Fat_Factor',
+        'CHO_Factor'
+      ],
+      map: {
+        scientific_name: 'SciName',
+        refuse_description: 'Ref_desc',
+        long_description: 'Long_Desc'
+      }
+    },
+    weights: {
+      filename: 'WEIGHT.txt',
+      map_into: 'usda_food_weights',
+      columns: [
+        'NDB_No',
+        'Seq',
+        'Amount',
+        'Msre_Desc',
+        'Gm_Wgt',
+        'Num_Data_Pts',
+        'Std_Dev'
+      ],
+      map: {
+        amount: 'Amount',
+        description: 'Msre_Desc',
+        gram_weight: 'Gm_Wgt'
+      }
+    }
+  }

   def initialize(directory)
@@ -105,47 +130,145 @@ class UsdaImporter
   def import
+    UsdaFoodWeight.delete_all
     UsdaFood.delete_all

-    food_data_lookup = {}
-
-    CSV.open(File.join(@directory, 'FOOD_DES.txt'), 'r:iso-8859-1:utf-8', csv_options(FOOD_DATA_COLUMNS)) do |csv|
-      csv.each do |row|
-        food_data_lookup[row['NDB_No']] = row.to_h
-      end
-    end
-
-    CSV.open(File.join(@directory, 'ABBREV.txt'), 'r:iso-8859-1:utf-8', csv_options(ABBREV_COLUMNS)) do |csv|
-      csv.each_slice(500) do |slice|
-        UsdaFood.transaction do
-
-          attributes = slice.map do |row|
-            attrs = Hash[ABBREV_COLUMN_MAP.map { |db, col| [db, row[col]] }]
-            lookup = food_data_lookup[attrs[:ndbn]]
-            if lookup
-              extra_attrs = Hash[FOOD_DATA_COLUMN_MAP.map { |db, col| [db, lookup[col]] }]
-              attrs.merge!(extra_attrs)
-            end
-            attrs
-          end
-
-          UsdaFood.create(attributes)
-
-        end
-      end
-    end
-
-    usda_items = Hash[UsdaFood.where(ndbn: Ingredient.select(:ndbn)).map { |uf| [uf.ndbn, uf] }]
-
-    Ingredient.where('ndbn IS NOT NULL').each do |i|
-      item = usda_items[i.ndbn]
-
-      if item
-        i.set_usda_food(item)
-        i.save
-      end
-    end
+    sorted_files = {}
+    opened_files = {}
+
+    # Sort each file by NDB_No
+    FILES.each do |name, data|
+      filename = File.join(@directory, data[:filename])
+      sorted_filename = "#{filename}.sorted"
+      idx = data[:columns].index('NDB_No')
+
+      if idx
+        idx += 1
+        `sort -n -t'^' -k#{idx}.2,#{idx}.6 #{filename} > #{sorted_filename}`
+      end
+
+      sorted_files[name] = sorted_filename
+    end
+
+    begin
+      sorted_files.each do |name, filename|
+        data = FILES[name]
+        opened_files[name] = CSV.open(filename, 'r:iso-8859-1:utf-8', csv_options(data[:columns]))
+      end
+
+      build_enumerator(opened_files).each_slice(500) do |slice|
+        UsdaFood.transaction do
+          slice.each do |data|
+            food = UsdaFood.new
+
+            data.each do |name, rows|
+              file_info = FILES[name]
+              obj = food
+
+              if file_info[:map_into]
+                obj = food.send(file_info[:map_into]).build
+              end
+
+              rows.each do |row|
+                file_info[:map].each do |db, col|
+                  obj.send("#{db}=", row[col])
+                end
+              end
+            end
+
+            food.save!
+          end
+        end
+      end
+    ensure
+      opened_files.each { |k, v| v.close }
+      sorted_files.each { |k, v| `rm #{v}` }
+    end
+
+    # UsdaFood.delete_all
+    #
+    # food_data_lookup = {}
+    #
+    # CSV.open(File.join(@directory, 'FOOD_DES.txt'), 'r:iso-8859-1:utf-8', csv_options(FOOD_DATA_COLUMNS)) do |csv|
+    #   csv.each do |row|
+    #     food_data_lookup[row['NDB_No']] = row.to_h
+    #   end
+    # end
+    #
+    # CSV.open(File.join(@directory, 'ABBREV.txt'), 'r:iso-8859-1:utf-8', csv_options(ABBREV_COLUMNS)) do |csv|
+    #   csv.each_slice(500) do |slice|
+    #     UsdaFood.transaction do
+    #
+    #       attributes = slice.map do |row|
+    #         attrs = Hash[ABBREV_COLUMN_MAP.map { |db, col| [db, row[col]] }]
+    #         lookup = food_data_lookup[attrs[:ndbn]]
+    #         if lookup
+    #           extra_attrs = Hash[FOOD_DATA_COLUMN_MAP.map { |db, col| [db, lookup[col]] }]
+    #           attrs.merge!(extra_attrs)
+    #         end
+    #         attrs
+    #       end
+    #
+    #       UsdaFood.create(attributes)
+    #
+    #     end
+    #   end
+    # end
+    #
+    # usda_items = Hash[UsdaFood.where(ndbn: Ingredient.select(:ndbn)).map { |uf| [uf.ndbn, uf] }]
+    #
+    # Ingredient.where('ndbn IS NOT NULL').each do |i|
+    #   item = usda_items[i.ndbn]
+    #
+    #   if item
+    #     i.set_usda_food(item)
+    #     i.save
+    #   end
+    # end
   end

+  def build_enumerator(opened_files)
+    enumerate_data = {}
+
+    opened_files.each do |name, csv|
+      csv_enumerator = csv.each
+      enumerate_data[name] = {
+        enumerator: csv_enumerator,
+        done: false,
+        next_ndbn: csv_enumerator.peek['NDB_No']
+      }
+    end
+
+    Enumerator.new do |yielder|
+      loop do
+        break if enumerate_data.values.all? { |d| d[:done] }
+
+        current_ndbn = enumerate_data.values.map { |d| d[:next_ndbn] }.min
+        results = Hash.new { |hash, key| hash[key] = [] }
+
+        enumerate_data.each do |name, data|
+          unless data[:done]
+            begin
+              while data[:next_ndbn] == current_ndbn
+                results[name] << data[:enumerator].next
+                data[:next_ndbn] = data[:enumerator].peek['NDB_No']
+              end
+            rescue StopIteration
+              data[:done] = true
+            end
+          end
+        end
+
+        yielder << results
+      end
+    end
+  end
+
   def csv_options(headers)
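A sketch of how the refactored importer would be driven and what build_enumerator yields per iteration. The caller below is an assumption (no rake task or caller is part of this commit); the directory matches the vendored data path added here:

importer = UsdaImporter.new('vendor/data/usda')
importer.import

# Each element yielded by build_enumerator groups the CSV rows sharing one NDB_No,
# keyed by source file, e.g. for NDB_No 01001:
# {
#   abbreviated: [#<CSV::Row "NDB_No" => "01001", ...>],
#   food_data:   [#<CSV::Row "NDB_No" => "01001", ...>],
#   weights:     [#<CSV::Row ...>, #<CSV::Row ...>, #<CSV::Row ...>, #<CSV::Row ...>]  # pat, tbsp, cup, stick
# }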

View File

@@ -0,0 +1,5 @@
require 'rails_helper'

RSpec.describe UsdaImporter do
end

View File

@@ -0,0 +1,2 @@
~01001~^~BUTTER,WITH SALT~^15.87^717^0.85^81.11^2.11^0.06^0.0^0.06^24^0.02^2^24^24^643^0.09^0.000^0.000^1.0^0.0^0.005^0.034^0.042^0.110^0.003^3^0^3^3^18.8^0.17^2499^684^671^0^158^0^0^0^2.32^0.0^0^7.0^51.368^21.021^3.043^215^5.0^~1 pat, (1" sq, 1/3" high)~^14.2^~1 tbsp~^0
~01006~^~CHEESE,BRIE~^48.42^334^20.75^27.68^2.70^0.45^0.0^0.45^184^0.50^20^188^152^629^2.38^0.019^0.034^14.5^0.0^0.070^0.520^0.380^0.690^0.235^65^0^65^65^15.4^1.65^592^174^173^0^9^0^0^0^0.24^0.5^20^2.3^17.410^8.013^0.826^100^28.35^~1 oz~^144^~1 cup, sliced~^0

View File

@@ -0,0 +1,2 @@
~01001~^~0100~^~Butter, salted~^~BUTTER,WITH SALT~^~~^~~^~Y~^~~^0^~~^6.38^4.27^8.79^3.87
~01006~^~0100~^~Cheese, brie~^~CHEESE,BRIE~^~~^~~^~Y~^~~^0^~~^6.38^4.27^8.79^3.87

View File

@@ -0,0 +1,9 @@
~01001~^1^1^~pat (1" sq, 1/3" high)~^5.0^^
~01001~^2^1^~tbsp~^14.2^^
~01001~^3^1^~cup~^227^^
~01001~^4^1^~stick~^113^^
~01006~^1^1^~oz~^28.35^^
~01006~^2^1^~cup, sliced~^144^^
~01006~^3^1^~cup, melted~^240^^
~01006~^4^1^~cubic inch~^17^^
~01006~^5^1^~package (4.5 oz)~^128^^
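For reference, a sketch of how one WEIGHT.txt row above maps onto a UsdaFoodWeight via FILES[:weights][:map]. The col_sep and quote_char values are assumptions inferred from the fixture format, since the body of csv_options is not shown in this diff:

require 'csv'

row = CSV.parse_line('~01001~^2^1^~tbsp~^14.2^^',
                     col_sep: '^',
                     quote_char: '~',
                     headers: ['NDB_No', 'Seq', 'Amount', 'Msre_Desc', 'Gm_Wgt', 'Num_Data_Pts', 'Std_Dev'])

# map: { amount: 'Amount', description: 'Msre_Desc', gram_weight: 'Gm_Wgt' }
UsdaFoodWeight.new(amount: row['Amount'],          # "1"
                   description: row['Msre_Desc'],  # "tbsp"
                   gram_weight: row['Gm_Wgt'])     # "14.2"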

vendor/data/usda/WEIGHT.txt (vendored, new file, 15438 additions)

File diff suppressed because it is too large