# parsley/lib/usda_importer.rb

require 'csv'
class UsdaImporter
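  # Nutrient codes from the Branded Food Products Nutrients.csv mapped to
  # UsdaFood columns. A value is only copied when its reported unit matches
  # :unit; the optional :convert lambdas rescale IU values (retinol IU * 0.3
  # is roughly mcg RAE, vitamin D IU / 40 is mcg), matching the units used by
  # the Standard Reference columns.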
  BRANDED_NUTRIENTS = {
    208 => { col: 'kcal',          unit: 'kcal' },
    203 => { col: 'protein',       unit: 'g' },
    255 => { col: 'water',         unit: 'g' },
    204 => { col: 'lipid',         unit: 'g' },
    207 => { col: 'ash',           unit: 'g' },
    205 => { col: 'carbohydrates', unit: 'g' },
    291 => { col: 'fiber',         unit: 'g' },
    269 => { col: 'sugar',         unit: 'g' },
    301 => { col: 'calcium',       unit: 'mg' },
    303 => { col: 'iron',          unit: 'mg' },
    304 => { col: 'magnesium',     unit: 'mg' },
    305 => { col: 'phosphorus',    unit: 'mg' },
    306 => { col: 'potassium',     unit: 'mg' },
    307 => { col: 'sodium',        unit: 'mg' },
    309 => { col: 'zinc',          unit: 'mg' },
    312 => { col: 'copper',        unit: 'mg' },
    315 => { col: 'manganese',     unit: 'mg' },
    401 => { col: 'vit_c',         unit: 'mg' },
    415 => { col: 'vit_b6',        unit: 'mg' },
    418 => { col: 'vit_b12',       unit: 'mcg' },
    318 => { col: 'vit_a',         unit: 'IU', convert: ->(x) { x.to_f * 0.3 } },
    324 => { col: 'vit_d',         unit: 'IU', convert: ->(x) { x.to_f / 40.0 } },
    430 => { col: 'vit_k',         unit: 'mcg' },
    601 => { col: 'cholesterol',   unit: 'mg' }
  }
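
  # Source files to import, keyed by a short name. Each entry describes the
  # file layout (:filename, :columns, :key_column, :csv) and how its rows map
  # onto the model: :map copies columns onto attributes, :map_function runs
  # custom per-row logic, :map_into builds an associated record instead of
  # writing to the food itself, and :static sets fixed attribute values.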
  FILES = {
    abbreviated: {
      filename: 'ABBREV.txt',
      key_column: 'NDB_No',
      columns: %w[
        NDB_No Shrt_Desc Water Energ_Kcal Protein Lipid_Tot Ash Carbohydrt
        Fiber_TD Sugar_Tot Calcium Iron Magnesium Phosphorus Potassium Sodium
        Zinc Copper Manganese Selenium Vit_C Thiamin Riboflavin Niacin
        Panto_acid Vit_B6 Folate_Tot Folic_acid Food_Folate Folate_DFE
        Choline_Tot Vit_B12 Vit_A_IU Vit_A_RAE Retinol Alpha_Carot Beta_Carot
        Beta_Crypt Lycopene Lut+Zea Vit_E Vit_D_mcg Vit_D_IU Vit_K FA_Sat
        FA_Mono FA_Poly Cholestrl GmWt_1 GmWt_Desc1 GmWt_2 GmWt_Desc2
        Refuse_Pct
      ],
      map: {
        ndbn: 'NDB_No',
        short_description: 'Shrt_Desc',
        water: 'Water',
        kcal: 'Energ_Kcal',
        protein: 'Protein',
        lipid: 'Lipid_Tot',
        ash: 'Ash',
        carbohydrates: 'Carbohydrt',
        fiber: 'Fiber_TD',
        sugar: 'Sugar_Tot',
        gram_weight_1: 'GmWt_1',
        gram_weight_2: 'GmWt_2',
        gram_weight_desc_1: 'GmWt_Desc1',
        gram_weight_desc_2: 'GmWt_Desc2',
        refuse_percent: 'Refuse_Pct',
        calcium: 'Calcium',
        iron: 'Iron',
        magnesium: 'Magnesium',
        phosphorus: 'Phosphorus',
        potassium: 'Potassium',
        sodium: 'Sodium',
        zinc: 'Zinc',
        copper: 'Copper',
        manganese: 'Manganese',
        vit_c: 'Vit_C',
        vit_b6: 'Vit_B6',
        vit_b12: 'Vit_B12',
        vit_a: 'Vit_A_RAE',
        vit_e: 'Vit_E',
        vit_d: 'Vit_D_mcg',
        vit_k: 'Vit_K',
        cholesterol: 'Cholestrl'
      },
      static: {
        source: 'sr',
        nutrient_unit: '100 grams'
      }
    },
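    # FOOD_DES.txt adds the long description, scientific name and refuse
    # description on top of the abbreviated data for the same NDB number.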
    food_data: {
      filename: 'FOOD_DES.txt',
      key_column: 'NDB_No',
      columns: %w[
        NDB_No FdGrp_Cd Long_Desc Shrt_Desc ComName ManufacName Survey
        Ref_desc Refuse SciName N_Factor Pro_Factor Fat_Factor CHO_Factor
      ],
      map: {
        scientific_name: 'SciName',
        refuse_description: 'Ref_desc',
        long_description: 'Long_Desc'
      }
    },
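    # WEIGHT.txt rows become usda_food_weights records (household measures
    # with their gram weights) rather than attributes on the food itself.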
    weights: {
      filename: 'WEIGHT.txt',
      key_column: 'NDB_No',
      map_into: 'usda_food_weights',
      columns: %w[
        NDB_No Seq Amount Msre_Desc Gm_Wgt Num_Data_Pts Std_Dev
      ],
      map: {
        amount: 'Amount',
        description: 'Msre_Desc',
        gram_weight: 'Gm_Wgt'
      }
    },
    # Branded Food Products database files
    products: {
      filename: 'branded/Products.csv',
      key_column: 'NDB_Number',
      csv: true,
      columns: %w[
        NDB_Number long_name data_source gtin_upc manufacturer date_modified
        date_available ingredients_english
      ],
      map: {
        ndbn: 'NDB_Number',
        long_description: 'long_name',
        manufacturer: 'manufacturer',
        ingredients: 'ingredients_english'
      },
      static: {
        source: 'bf',
        nutrient_unit: '100 g'
      }
    },
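    # Nutrients.csv has one row per nutrient per food, so it is mapped with a
    # custom function against BRANDED_NUTRIENTS instead of a column map.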
    nutrients: {
      filename: 'branded/Nutrients.csv',
      key_column: 'NDB_No',
      csv: true,
      columns: %w[
        NDB_No Nutrient_Code Nutrient_name Derivation_Code Output_value
        Output_uom
      ],
      map_function: ->(obj, row) do
        mapping = BRANDED_NUTRIENTS[row['Nutrient_Code'].to_i]
        if mapping && mapping[:unit] == row['Output_uom']
          value = row['Output_value']
          value = mapping[:convert].call(value) if mapping[:convert]
          obj.send("#{mapping[:col]}=", value)
        end
      end
    },
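    # Serving_size.csv also maps into usda_food_weights: the household serving
    # becomes the measure amount/description and Serving_Size is stored as the
    # gram weight.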
    serving_sizes: {
      filename: 'branded/Serving_size.csv',
      key_column: 'NDB_No',
      csv: true,
      map_into: 'usda_food_weights',
      columns: %w[
        NDB_No Serving_Size Serving_Size_UOM Household_Serving_Size
        Household_Serving_Size_UOM Preparation_State
      ],
      map: {
        amount: 'Household_Serving_Size',
        description: 'Household_Serving_Size_UOM',
        gram_weight: 'Serving_Size'
      }
    }
  }
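
  # directory: path to the extracted USDA data files (the Standard Reference
  # ASCII files plus a branded/ subdirectory containing the CSV exports).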
  def initialize(directory)
    @directory = directory
  end
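
  # Rebuilds the UsdaFood tables from scratch: sorts every source file by its
  # NDB number, streams the sorted files in lockstep so all rows for one food
  # arrive together, and inserts the foods in batches. Afterwards, existing
  # Food records are re-linked to the freshly imported UsdaFood rows.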
  def import
    UsdaFoodWeight.delete_all
    UsdaFood.delete_all
    sorted_files = {}
    opened_files = {}
    # Sort each file by NDB_No
    FILES.each do |name, data|
      filename = File.join(@directory, data[:filename])
      sorted_filename = "#{filename}.sorted"
      idx = data[:columns].index(data[:key_column])
      if idx
        idx += 1
        if data[:csv]
          `head -n 1 #{filename} > #{sorted_filename}`
          `tail -n +2 #{filename} | sort -n -t',' -k#{idx} - >> #{sorted_filename}`
        else
          `sort -n -t'^' -k#{idx}.2,#{idx}.6 #{filename} > #{sorted_filename}`
        end
      end
      sorted_files[name] = sorted_filename
    end
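
    # Open each sorted file and stream them together; the temporary sorted
    # copies are removed in the ensure block even if the import fails.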
    begin
      sorted_files.each do |name, filename|
        data = FILES[name]
        opened_files[name] = CSV.open(filename, 'r:iso-8859-1:utf-8', csv_options(data))
      end
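
      # Each item yielded by build_enumerator holds every row for a single NDB
      # number, keyed by file name; insert 500 foods per transaction.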
      build_enumerator(opened_files).each_slice(500) do |slice|
        UsdaFood.transaction do
          slice.each do |food_rows|
            food = UsdaFood.new
            food_rows.each do |name, rows|
              file_info = FILES[name]
              obj = food
              rows.each do |row|
                if file_info[:map_into]
                  obj = food.send(file_info[:map_into]).build
                end
                if file_info[:static]
                  file_info[:static].each do |k, v|
                    obj.send("#{k}=", v)
                  end
                end
                if file_info[:map_function]
                  file_info[:map_function].call(obj, row)
                else
                  file_info[:map].each do |db, col|
                    obj.send("#{db}=", row[col])
                  end
                end
              end
            end
            food.save!
          end
        end
      end
    ensure
      opened_files.each_value(&:close)
      sorted_files.each_value { |sorted| `rm #{sorted}` }
    end

    Food.where('ndbn != ?', '').where('ndbn IS NOT NULL').each do |food|
      food.set_usda_food(food.usda_food)
      food.save!
    end
  end
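
  # Merge-reads the sorted files in parallel. Peeks at each file's next NDB
  # number, picks the smallest one still pending, pulls every matching row
  # from every file, and yields them grouped by file name, so each yielded
  # hash describes exactly one food.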
  def build_enumerator(opened_files)
    enumerate_data = {}
    opened_files.each do |name, csv|
      file_data = FILES[name]
      csv_enumerator = csv.each
      enumerate_data[name] = {
        enumerator: csv_enumerator,
        done: false,
        next_ndbn: csv_enumerator.peek[file_data[:key_column]],
        peek_next_ndbn: -> { csv_enumerator.peek[file_data[:key_column]] }
      }
    end

    Enumerator.new do |yielder|
      loop do
        break if enumerate_data.values.all? { |d| d[:done] }
        current_ndbn = enumerate_data.values.reject { |d| d[:done] }.map { |d| d[:next_ndbn] }.min
        results = Hash.new { |hash, key| hash[key] = [] }
        enumerate_data.each do |name, data|
          next if data[:done]
          begin
            while data[:next_ndbn] == current_ndbn
              results[name] << data[:enumerator].next
              data[:next_ndbn] = data[:peek_next_ndbn].call
            end
          rescue StopIteration
            data[:done] = true
          end
        end
        yielder << results
      end
    end
  end
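
  # The SR ASCII files have no header row and use ^ as the field separator
  # with ~ as the quote character, so headers come from the configured column
  # list; the branded files are ordinary CSVs with a header row.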
  def csv_options(data)
    if data[:csv]
      { headers: true }
    else
      { col_sep: '^', quote_char: '~', headers: data[:columns] }
    end
  end
end