Adding USDA weights; usda_importer refactor

Dan Elbert 2016-03-09 18:53:47 -06:00
parent 068b01a7c8
commit 2f752eae61
10 changed files with 15727 additions and 117 deletions

View File

@@ -1,6 +1,8 @@
 class UsdaFood < ActiveRecord::Base
   include TokenizedLike

+  has_many :usda_food_weights
+
   def self.search(query)
     tokens = query.to_s.split(' ')

View File

@@ -0,0 +1,5 @@
class UsdaFoodWeight < ActiveRecord::Base
  belongs_to :usda_food
end
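For reviewers, a minimal usage sketch of the new has_many/belongs_to pair (assumes standard Rails conventions; the NDB number and gram weights come from the spec fixture data later in this commit):

food = UsdaFood.find_by(ndbn: '01001')   # "Butter, salted"
food.usda_food_weights.each do |weight|
  # e.g. "1.0 pat (1" sq, 1/3" high) = 5.0 g", "1.0 tbsp = 14.2 g", ...
  puts "#{weight.amount} #{weight.description} = #{weight.gram_weight} g"
end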

View File

@@ -0,0 +1,13 @@
class CreateUsdaFoodWeights < ActiveRecord::Migration
  def change
    create_table :usda_food_weights do |t|
      t.integer :usda_food_id, index: true, null: false
      t.decimal :amount, precision: 5, scale: 3
      t.string :description
      t.decimal :gram_weight, precision: 7, scale: 1

      t.timestamps null: false
    end
  end
end
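A quick sanity check on the chosen column sizes (assuming standard ActiveRecord decimal semantics; not part of the commit):

require 'bigdecimal'

# precision is total digits, scale is digits after the decimal point:
max_amount      = BigDecimal('99.999')    # largest value that fits precision: 5, scale: 3
max_gram_weight = BigDecimal('999999.9')  # largest value that fits precision: 7, scale: 1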

View File

@@ -11,7 +11,7 @@
 #
 # It's strongly recommended that you check this file into your version control system.

-ActiveRecord::Schema.define(version: 20160303180854) do
+ActiveRecord::Schema.define(version: 20160309182253) do

   create_table "ingredients", force: :cascade do |t|
     t.string "name"
@@ -68,6 +68,17 @@ ActiveRecord::Schema.define(version: 20160303180854) do
     t.integer "user_id"
   end

+  create_table "usda_food_weights", force: :cascade do |t|
+    t.integer  "usda_food_id",                          null: false
+    t.decimal  "amount",       precision: 5, scale: 3
+    t.string   "description"
+    t.decimal  "gram_weight",  precision: 7, scale: 1
+    t.datetime "created_at",                            null: false
+    t.datetime "updated_at",                            null: false
+  end
+
+  add_index "usda_food_weights", ["usda_food_id"], name: "index_usda_food_weights_on_usda_food_id"
+
   create_table "usda_foods", force: :cascade do |t|
     t.string "ndbn", limit: 5, null: false
     t.string "long_description"

View File

@@ -2,101 +2,126 @@ require 'csv'

 class UsdaImporter

-  ABBREV_COLUMNS = [
-    'NDB_No',
-    'Shrt_Desc',
-    'Water',
-    'Energ_Kcal',
-    'Protein',
-    'Lipid_Tot',
-    'Ash',
-    'Carbohydrt',
-    'Fiber_TD',
-    'Sugar_Tot',
-    'Calcium',
-    'Iron',
-    'Magnesium',
-    'Phosphorus',
-    'Potassium',
-    'Sodium',
-    'Zinc',
-    'Copper',
-    'Manganese',
-    'Selenium',
-    'Vit_C',
-    'Thiamin',
-    'Riboflavin',
-    'Niacin',
-    'Panto_acid',
-    'Vit_B6',
-    'Folate_Tot',
-    'Folic_acid',
-    'Food_Folate',
-    'Folate_DFE',
-    'Choline_Tot',
-    'Vit_B12',
-    'Vit_A_IU',
-    'Vit_A_RAE',
-    'Retinol',
-    'Alpha_Carot',
-    'Beta_Carot',
-    'Beta_Crypt',
-    'Lycopene',
-    'Lut+Zea',
-    'Vit_E',
-    'Vit_D_mcg',
-    'Vit_D_IU',
-    'Vit_K',
-    'FA_Sat',
-    'FA_Mono',
-    'FA_Poly',
-    'Cholestrl',
-    'GmWt_1',
-    'GmWt_Desc1',
-    'GmWt_2',
-    'GmWt_Desc2',
-    'Refuse_Pct'
-  ]
-
-  FOOD_DATA_COLUMNS = [
-    'NDB_No',
-    'FdGrp_Cd',
-    'Long_Desc',
-    'Shrt_Desc',
-    'ComName',
-    'ManufacName',
-    'Survey',
-    'Ref_desc',
-    'Refuse',
-    'SciName',
-    'N_Factor',
-    'Pro_Factor',
-    'Fat_Factor',
-    'CHO_Factor'
-  ]
-
-  ABBREV_COLUMN_MAP = {
-    ndbn: 'NDB_No',
-    short_description: 'Shrt_Desc',
-    water: 'Water',
-    kcal: 'Energ_Kcal',
-    protein: 'Protein',
-    lipid: 'Lipid_Tot',
-    ash: 'Ash',
-    carbohydrates: 'Carbohydrt',
-    fiber: 'Fiber_TD',
-    sugar: 'Sugar_Tot',
-    gram_weight_1: 'GmWt_1',
-    gram_weight_2: 'GmWt_2',
-    gram_weight_desc_1: 'GmWt_Desc1',
-    gram_weight_desc_2: 'GmWt_Desc2',
-    refuse_percent: 'Refuse_Pct'
-  }
-
-  FOOD_DATA_COLUMN_MAP = {
-    scientific_name: 'SciName',
-    refuse_description: 'Ref_desc',
-    long_description: 'Long_Desc'
-  }
+  FILES = {
+    abbreviated: {
+      filename: 'ABBREV.txt',
+      columns: [
+        'NDB_No',
+        'Shrt_Desc',
+        'Water',
+        'Energ_Kcal',
+        'Protein',
+        'Lipid_Tot',
+        'Ash',
+        'Carbohydrt',
+        'Fiber_TD',
+        'Sugar_Tot',
+        'Calcium',
+        'Iron',
+        'Magnesium',
+        'Phosphorus',
+        'Potassium',
+        'Sodium',
+        'Zinc',
+        'Copper',
+        'Manganese',
+        'Selenium',
+        'Vit_C',
+        'Thiamin',
+        'Riboflavin',
+        'Niacin',
+        'Panto_acid',
+        'Vit_B6',
+        'Folate_Tot',
+        'Folic_acid',
+        'Food_Folate',
+        'Folate_DFE',
+        'Choline_Tot',
+        'Vit_B12',
+        'Vit_A_IU',
+        'Vit_A_RAE',
+        'Retinol',
+        'Alpha_Carot',
+        'Beta_Carot',
+        'Beta_Crypt',
+        'Lycopene',
+        'Lut+Zea',
+        'Vit_E',
+        'Vit_D_mcg',
+        'Vit_D_IU',
+        'Vit_K',
+        'FA_Sat',
+        'FA_Mono',
+        'FA_Poly',
+        'Cholestrl',
+        'GmWt_1',
+        'GmWt_Desc1',
+        'GmWt_2',
+        'GmWt_Desc2',
+        'Refuse_Pct'
+      ],
+      map: {
+        ndbn: 'NDB_No',
+        short_description: 'Shrt_Desc',
+        water: 'Water',
+        kcal: 'Energ_Kcal',
+        protein: 'Protein',
+        lipid: 'Lipid_Tot',
+        ash: 'Ash',
+        carbohydrates: 'Carbohydrt',
+        fiber: 'Fiber_TD',
+        sugar: 'Sugar_Tot',
+        gram_weight_1: 'GmWt_1',
+        gram_weight_2: 'GmWt_2',
+        gram_weight_desc_1: 'GmWt_Desc1',
+        gram_weight_desc_2: 'GmWt_Desc2',
+        refuse_percent: 'Refuse_Pct'
+      }
+    },
+    food_data: {
+      filename: 'FOOD_DES.txt',
+      columns: [
+        'NDB_No',
+        'FdGrp_Cd',
+        'Long_Desc',
+        'Shrt_Desc',
+        'ComName',
+        'ManufacName',
+        'Survey',
+        'Ref_desc',
+        'Refuse',
+        'SciName',
+        'N_Factor',
+        'Pro_Factor',
+        'Fat_Factor',
+        'CHO_Factor'
+      ],
+      map: {
+        scientific_name: 'SciName',
+        refuse_description: 'Ref_desc',
+        long_description: 'Long_Desc'
+      }
+    },
+    weights: {
+      filename: 'WEIGHT.txt',
+      map_into: 'usda_food_weights',
+      columns: [
+        'NDB_No',
+        'Seq',
+        'Amount',
+        'Msre_Desc',
+        'Gm_Wgt',
+        'Num_Data_Pts',
+        'Std_Dev'
+      ],
+      map: {
+        amount: 'Amount',
+        description: 'Msre_Desc',
+        gram_weight: 'Gm_Wgt'
+      }
+    }
+  }

   def initialize(directory)
@@ -105,47 +130,145 @@ class UsdaImporter
   def import
+    UsdaFoodWeight.delete_all
     UsdaFood.delete_all

-    food_data_lookup = {}
-
-    CSV.open(File.join(@directory, 'FOOD_DES.txt'), 'r:iso-8859-1:utf-8', csv_options(FOOD_DATA_COLUMNS)) do |csv|
-      csv.each do |row|
-        food_data_lookup[row['NDB_No']] = row.to_h
-      end
-    end
-
-    CSV.open(File.join(@directory, 'ABBREV.txt'), 'r:iso-8859-1:utf-8', csv_options(ABBREV_COLUMNS)) do |csv|
-      csv.each_slice(500) do |slice|
-        UsdaFood.transaction do
-
-          attributes = slice.map do |row|
-            attrs = Hash[ABBREV_COLUMN_MAP.map { |db, col| [db, row[col]] }]
-            lookup = food_data_lookup[attrs[:ndbn]]
-            if lookup
-              extra_attrs = Hash[FOOD_DATA_COLUMN_MAP.map { |db, col| [db, lookup[col]] }]
-              attrs.merge!(extra_attrs)
-            end
-            attrs
-          end
-
-          UsdaFood.create(attributes)
-
-        end
-      end
-    end
-
-    usda_items = Hash[UsdaFood.where(ndbn: Ingredient.select(:ndbn)).map { |uf| [uf.ndbn, uf] }]
-
-    Ingredient.where('ndbn IS NOT NULL').each do |i|
-      item = usda_items[i.ndbn]
-
-      if item
-        i.set_usda_food(item)
-        i.save
-      end
-    end
+    sorted_files = {}
+    opened_files = {}
+
+    # Sort each file by NDB_No
+    FILES.each do |name, data|
+      filename = File.join(@directory, data[:filename])
+      sorted_filename = "#{filename}.sorted"
+      idx = data[:columns].index('NDB_No')
+
+      if idx
+        idx += 1
+        `sort -n -t'^' -k#{idx}.2,#{idx}.6 #{filename} > #{sorted_filename}`
+      end
+
+      sorted_files[name] = sorted_filename
+    end
+
+    begin
+      sorted_files.each do |name, filename|
+        data = FILES[name]
+        opened_files[name] = CSV.open(filename, 'r:iso-8859-1:utf-8', csv_options(data[:columns]))
+      end
+
+      build_enumerator(opened_files).each_slice(500) do |slice|
+        UsdaFood.transaction do
+          slice.each do |data|
+            food = UsdaFood.new
+
+            data.each do |name, rows|
+              file_info = FILES[name]
+              obj = food
+
+              if file_info[:map_into]
+                obj = food.send(file_info[:map_into]).build
+              end
+
+              rows.each do |row|
+                file_info[:map].each do |db, col|
+                  obj.send("#{db}=", row[col])
+                end
+              end
+            end
+
+            food.save!
+          end
+        end
+      end
+    ensure
+      opened_files.each { |k, v| v.close }
+      sorted_files.each { |k, v| `rm #{v}` }
+    end
+
+    # UsdaFood.delete_all
+    #
+    # food_data_lookup = {}
+    #
+    # CSV.open(File.join(@directory, 'FOOD_DES.txt'), 'r:iso-8859-1:utf-8', csv_options(FOOD_DATA_COLUMNS)) do |csv|
+    #   csv.each do |row|
+    #     food_data_lookup[row['NDB_No']] = row.to_h
+    #   end
+    # end
+    #
+    # CSV.open(File.join(@directory, 'ABBREV.txt'), 'r:iso-8859-1:utf-8', csv_options(ABBREV_COLUMNS)) do |csv|
+    #   csv.each_slice(500) do |slice|
+    #     UsdaFood.transaction do
+    #
+    #       attributes = slice.map do |row|
+    #         attrs = Hash[ABBREV_COLUMN_MAP.map { |db, col| [db, row[col]] }]
+    #         lookup = food_data_lookup[attrs[:ndbn]]
+    #         if lookup
+    #           extra_attrs = Hash[FOOD_DATA_COLUMN_MAP.map { |db, col| [db, lookup[col]] }]
+    #           attrs.merge!(extra_attrs)
+    #         end
+    #         attrs
+    #       end
+    #
+    #       UsdaFood.create(attributes)
+    #
+    #     end
+    #   end
+    # end
+    #
+    # usda_items = Hash[UsdaFood.where(ndbn: Ingredient.select(:ndbn)).map { |uf| [uf.ndbn, uf] }]
+    #
+    # Ingredient.where('ndbn IS NOT NULL').each do |i|
+    #   item = usda_items[i.ndbn]
+    #
+    #   if item
+    #     i.set_usda_food(item)
+    #     i.save
+    #   end
+    # end
   end

+  def build_enumerator(opened_files)
+    enumerate_data = {}
+
+    opened_files.each do |name, csv|
+      csv_enumerator = csv.each
+      enumerate_data[name] = {
+        enumerator: csv_enumerator,
+        done: false,
+        next_ndbn: csv_enumerator.peek['NDB_No']
+      }
+    end
+
+    Enumerator.new do |yielder|
+      loop do
+        break if enumerate_data.values.all? { |d| d[:done] }
+
+        current_ndbn = enumerate_data.values.map { |d| d[:next_ndbn] }.min
+        results = Hash.new { |hash, key| hash[key] = [] }
+
+        enumerate_data.each do |name, data|
+          unless data[:done]
+            begin
+              while data[:next_ndbn] == current_ndbn
+                results[name] << data[:enumerator].next
+                data[:next_ndbn] = data[:enumerator].peek['NDB_No']
+              end
+            rescue StopIteration
+              data[:done] = true
+            end
+          end
+        end
+
+        yielder << results
+      end
+    end
+  end
+
   def csv_options(headers)
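A sketch of how the refactored importer would be driven and what build_enumerator yields per iteration. The caller below is an assumption (no rake task or caller is part of this commit); the directory matches the vendored data path added here:

importer = UsdaImporter.new('vendor/data/usda')
importer.import

# Each element yielded by build_enumerator groups the CSV rows sharing one NDB_No,
# keyed by source file, e.g. for NDB_No 01001:
# {
#   abbreviated: [#<CSV::Row "NDB_No" => "01001", ...>],
#   food_data:   [#<CSV::Row "NDB_No" => "01001", ...>],
#   weights:     [#<CSV::Row ...>, #<CSV::Row ...>, #<CSV::Row ...>, #<CSV::Row ...>]  # pat, tbsp, cup, stick
# }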

View File

@@ -0,0 +1,5 @@
require 'rails_helper'

RSpec.describe UsdaImporter do
end

View File

@@ -0,0 +1,2 @@
~01001~^~BUTTER,WITH SALT~^15.87^717^0.85^81.11^2.11^0.06^0.0^0.06^24^0.02^2^24^24^643^0.09^0.000^0.000^1.0^0.0^0.005^0.034^0.042^0.110^0.003^3^0^3^3^18.8^0.17^2499^684^671^0^158^0^0^0^2.32^0.0^0^7.0^51.368^21.021^3.043^215^5.0^~1 pat, (1" sq, 1/3" high)~^14.2^~1 tbsp~^0
~01006~^~CHEESE,BRIE~^48.42^334^20.75^27.68^2.70^0.45^0.0^0.45^184^0.50^20^188^152^629^2.38^0.019^0.034^14.5^0.0^0.070^0.520^0.380^0.690^0.235^65^0^65^65^15.4^1.65^592^174^173^0^9^0^0^0^0.24^0.5^20^2.3^17.410^8.013^0.826^100^28.35^~1 oz~^144^~1 cup, sliced~^0

View File

@@ -0,0 +1,2 @@
~01001~^~0100~^~Butter, salted~^~BUTTER,WITH SALT~^~~^~~^~Y~^~~^0^~~^6.38^4.27^8.79^3.87
~01006~^~0100~^~Cheese, brie~^~CHEESE,BRIE~^~~^~~^~Y~^~~^0^~~^6.38^4.27^8.79^3.87

View File

@@ -0,0 +1,9 @@
~01001~^1^1^~pat (1" sq, 1/3" high)~^5.0^^
~01001~^2^1^~tbsp~^14.2^^
~01001~^3^1^~cup~^227^^
~01001~^4^1^~stick~^113^^
~01006~^1^1^~oz~^28.35^^
~01006~^2^1^~cup, sliced~^144^^
~01006~^3^1^~cup, melted~^240^^
~01006~^4^1^~cubic inch~^17^^
~01006~^5^1^~package (4.5 oz)~^128^^
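For reference, a sketch of how one WEIGHT.txt row above maps onto a UsdaFoodWeight via FILES[:weights][:map]. The col_sep and quote_char values are assumptions inferred from the fixture format, since the body of csv_options is not shown in this diff:

require 'csv'

row = CSV.parse_line('~01001~^2^1^~tbsp~^14.2^^',
                     col_sep: '^',
                     quote_char: '~',
                     headers: ['NDB_No', 'Seq', 'Amount', 'Msre_Desc', 'Gm_Wgt', 'Num_Data_Pts', 'Std_Dev'])

# map: { amount: 'Amount', description: 'Msre_Desc', gram_weight: 'Gm_Wgt' }
UsdaFoodWeight.new(amount: row['Amount'],          # "1"
                   description: row['Msre_Desc'],  # "tbsp"
                   gram_weight: row['Gm_Wgt'])     # "14.2"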

vendor/data/usda/WEIGHT.txt (vendored, new file, 15438 additions)

File diff suppressed because it is too large