# parsley/lib/usda_importer.rb

require 'csv'

# Imports USDA flat files found in a directory into UsdaFood records and
# their associated records (see the :map_into entries in FILES).
#
# The files are read as CSV with '^' as column separator and '~' as quote
# character (see #csv_options). Each file is first sorted by its NDB_No column
# so that all files can be merged in a single pass: the rows of every file
# that share an NDB_No are grouped together and turned into one UsdaFood.
class UsdaImporter

  # Describes each input file:
  #   filename - name of the file inside the import directory
  #   columns  - column names in file order; passed to CSV as explicit headers
  #   map      - model attribute => source column to copy onto the record
  #   map_into - optional association name on UsdaFood; when present, each row
  #              of the file is copied onto a newly built associated record
  #              instead of onto the food itself
  FILES = {
    abbreviated: {
      filename: 'ABBREV.txt',
      columns: %w[
        NDB_No Shrt_Desc Water Energ_Kcal Protein Lipid_Tot Ash Carbohydrt
        Fiber_TD Sugar_Tot Calcium Iron Magnesium Phosphorus Potassium Sodium
        Zinc Copper Manganese Selenium Vit_C Thiamin Riboflavin Niacin
        Panto_acid Vit_B6 Folate_Tot Folic_acid Food_Folate Folate_DFE
        Choline_Tot Vit_B12 Vit_A_IU Vit_A_RAE Retinol Alpha_Carot Beta_Carot
        Beta_Crypt Lycopene Lut+Zea Vit_E Vit_D_mcg Vit_D_IU Vit_K FA_Sat
        FA_Mono FA_Poly Cholestrl GmWt_1 GmWt_Desc1 GmWt_2 GmWt_Desc2
        Refuse_Pct
      ],
      map: {
        ndbn: 'NDB_No',
        short_description: 'Shrt_Desc',
        water: 'Water',
        kcal: 'Energ_Kcal',
        protein: 'Protein',
        lipid: 'Lipid_Tot',
        ash: 'Ash',
        carbohydrates: 'Carbohydrt',
        fiber: 'Fiber_TD',
        sugar: 'Sugar_Tot',
        gram_weight_1: 'GmWt_1',
        gram_weight_2: 'GmWt_2',
        gram_weight_desc_1: 'GmWt_Desc1',
        gram_weight_desc_2: 'GmWt_Desc2',
        refuse_percent: 'Refuse_Pct'
      }
    },

    food_data: {
      filename: 'FOOD_DES.txt',
      columns: %w[
        NDB_No FdGrp_Cd Long_Desc Shrt_Desc ComName ManufacName Survey
        Ref_desc Refuse SciName N_Factor Pro_Factor Fat_Factor CHO_Factor
      ],
      map: {
        scientific_name: 'SciName',
        refuse_description: 'Ref_desc',
        long_description: 'Long_Desc'
      }
    },

    weights: {
      filename: 'WEIGHT.txt',
      map_into: 'usda_food_weights',
      columns: %w[NDB_No Seq Amount Msre_Desc Gm_Wgt Num_Data_Pts Std_Dev],
      map: {
        amount: 'Amount',
        description: 'Msre_Desc',
        gram_weight: 'Gm_Wgt'
      }
    }
  }.freeze

  # @param directory [String] directory that contains the files named in FILES
  def initialize(directory)
    @directory = directory
  end

  # Replaces all UsdaFood / UsdaFoodWeight rows with the contents of the files.
  #
  # Existing rows of both models are deleted first. Each source file is sorted
  # by NDB_No into a temporary "<file>.sorted" copy, the sorted copies are
  # merged with #build_enumerator, and the resulting foods are saved in
  # transactions of 500. The temporary copies are removed afterwards, also
  # when the import fails.
  #
  # @raise [ArgumentError] if a file definition has no NDB_No column
  # @raise [RuntimeError] if the external `sort` command fails
  def import
    UsdaFoodWeight.delete_all
    UsdaFood.delete_all

    sorted_files = {}
    opened_files = {}

    begin
      FILES.each do |name, data|
        source = File.join(@directory, data[:filename])
        # Register the path before sorting so a partially written file is
        # still cleaned up by the ensure clause if sorting fails.
        sorted_files[name] = "#{source}.sorted"
        sort_by_ndbn(source, sorted_files[name], data[:columns])
      end

      sorted_files.each do |name, filename|
        # The options must be splatted: newer CSV versions accept them as
        # keyword arguments only.
        opened_files[name] = CSV.open(filename, 'r:iso-8859-1:utf-8', **csv_options(FILES[name][:columns]))
      end

      build_enumerator(opened_files).each_slice(500) do |slice|
        UsdaFood.transaction do
          slice.each { |data| build_food(data).save! }
        end
      end
    ensure
      opened_files.each_value(&:close)
      sorted_files.each_value { |path| File.delete(path) if File.exist?(path) }
    end

    # NOTE(review): the commented-out code below is not executed. It looks like
    # an earlier lookup-based import that also re-linked Ingredient records to
    # the imported foods; the live code above has no such step - confirm
    # whether that step is still needed before deleting this.
    #
    # UsdaFood.delete_all
    #
    # food_data_lookup = {}
    #
    # CSV.open(File.join(@directory, 'FOOD_DES.txt'), 'r:iso-8859-1:utf-8', csv_options(FOOD_DATA_COLUMNS)) do |csv|
    #   csv.each do |row|
    #     food_data_lookup[row['NDB_No']] = row.to_h
    #   end
    # end
    #
    # CSV.open(File.join(@directory, 'ABBREV.txt'), 'r:iso-8859-1:utf-8', csv_options(ABBREV_COLUMNS)) do |csv|
    #   csv.each_slice(500) do |slice|
    #     UsdaFood.transaction do
    #
    #       attributes = slice.map do |row|
    #         attrs = Hash[ABBREV_COLUMN_MAP.map { |db, col| [db, row[col]] }]
    #         lookup = food_data_lookup[attrs[:ndbn]]
    #         if lookup
    #           extra_attrs = Hash[FOOD_DATA_COLUMN_MAP.map { |db, col| [db, lookup[col]] }]
    #           attrs.merge!(extra_attrs)
    #         end
    #         attrs
    #       end
    #
    #       UsdaFood.create(attributes)
    #
    #     end
    #   end
    # end
    #
    # usda_items = Hash[UsdaFood.where(ndbn: Ingredient.select(:ndbn)).map { |uf| [uf.ndbn, uf] }]
    #
    # Ingredient.where('ndbn IS NOT NULL').each do |i|
    #   item = usda_items[i.ndbn]
    #
    #   if item
    #     i.set_usda_food(item)
    #     i.save
    #   end
    # end
  end

  # Merges several row sources that are each ordered by NDB_No.
  #
  # @param opened_files [Hash] file name => object whose #each (called without
  #   a block) returns an Enumerator of rows that respond to ['NDB_No']
  # @return [Enumerator] yields one Hash per distinct NDB_No, in ascending
  #   order, mapping file name => Array of that file's rows for the NDB_No.
  #   Files without rows for the NDB_No have no key in the Hash.
  def build_enumerator(opened_files)
    cursors = opened_files.map do |name, csv|
      cursor = { name: name, enumerator: csv.each, done: false, next_ndbn: nil }
      # Also marks empty sources as done instead of raising StopIteration.
      advance_cursor(cursor)
      cursor
    end

    Enumerator.new do |yielder|
      loop do
        # Exhausted sources must not take part in the minimum: their last key
        # would otherwise be selected forever and nothing would be consumed.
        pending = cursors.reject { |cursor| cursor[:done] }
        break if pending.empty?

        current_ndbn = pending.map { |cursor| cursor[:next_ndbn] }.min
        results = Hash.new { |hash, key| hash[key] = [] }

        pending.each do |cursor|
          while !cursor[:done] && cursor[:next_ndbn] == current_ndbn
            results[cursor[:name]] << cursor[:enumerator].next
            advance_cursor(cursor)
          end
        end

        yielder << results
      end
    end
  end

  # @param headers [Array<String>] column names to use as CSV headers
  # @return [Hash] CSV options: '^' column separator, '~' quote character
  def csv_options(headers)
    { col_sep: '^', quote_char: '~', headers: headers }
  end

  private

  # Sorts +source+ numerically by its NDB_No column into +destination+ using
  # the external `sort` command (invoked without a shell, so paths need no
  # escaping).
  def sort_by_ndbn(source, destination, columns)
    idx = columns.index('NDB_No')
    raise ArgumentError, "no NDB_No column defined for #{source}" unless idx

    field = idx + 1 # sort(1) counts fields from 1
    # Characters 2-6 of the field skip the leading '~' quote character.
    sorted = system('sort', '-n', '-t^', "-k#{field}.2,#{field}.6", '-o', destination, source)
    raise "sort failed for #{source}" unless sorted
  end

  # Builds (without saving) one UsdaFood from the grouped rows of one NDB_No.
  # Rows of files with :map_into each get their own associated record.
  def build_food(data)
    food = UsdaFood.new

    data.each do |name, rows|
      file_info = FILES[name]

      rows.each do |row|
        target = file_info[:map_into] ? food.public_send(file_info[:map_into]).build : food
        file_info[:map].each { |db, col| target.public_send("#{db}=", row[col]) }
      end
    end

    food
  end

  # Peeks at the next row of a cursor and records its NDB_No, or marks the
  # cursor as done (and clears its key) when the source is exhausted.
  def advance_cursor(cursor)
    cursor[:next_ndbn] = cursor[:enumerator].peek['NDB_No']
  rescue StopIteration
    cursor[:done] = true
    cursor[:next_ndbn] = nil
  end

end