ruby/lib/csv/table.rb

# frozen_string_literal: true

require "forwardable"

class CSV
  #
  # A CSV::Table is a two-dimensional data structure for representing CSV
  # documents. Tables allow you to work with the data by row or column,
  # manipulate the data, and even convert the results back to CSV, if needed.
  #
  # All tables returned by CSV will be constructed from this class, if header
  # row processing is activated.
  #
  class Table
    #
    # Constructs a new CSV::Table from +array_of_rows+, which are expected
    # to be CSV::Row objects. All rows are assumed to have the same headers.
    #
    # The optional +headers+ parameter can be set to Array of headers.
    # If headers aren't set, headers are fetched from CSV::Row objects.
    # Otherwise, headers() method will return headers being set in
    # headers argument.
    #
    # A CSV::Table object supports the following Array methods through
    # delegation:
    #
    # * empty?()
    # * length()
    # * size()
    #
    def initialize(array_of_rows, headers: nil)
      @table = array_of_rows
      @headers = headers
      unless @headers
        if @table.empty?
          @headers = []
        else
          @headers = @table.first.headers
        end
      end

      @mode  = :col_or_row
    end

    # The current access mode for indexing and iteration.
    attr_reader :mode

    # Internal data format used to compare equality.
    attr_reader :table
    protected   :table

    ### Array Delegation ###

    extend Forwardable
    def_delegators :@table, :empty?, :length, :size

    #
    # Returns a duplicate table object, in column mode. This is handy for
    # chaining in a single call without changing the table mode, but be aware
    # that this method can consume a fair amount of memory for bigger data sets.
    #
    # This method returns the duplicate table for chaining. Don't chain
    # destructive methods (like []=()) this way though, since you are working
    # with a duplicate.
    #
    def by_col
      self.class.new(@table.dup).by_col!
    end

    #
    # Switches the mode of this table to column mode. All calls to indexing and
    # iteration methods will work with columns until the mode is changed again.
    #
    # This method returns the table and is safe to chain.
    #
    def by_col!
      @mode = :col

      self
    end

    #
    # Returns a duplicate table object, in mixed mode. This is handy for
    # chaining in a single call without changing the table mode, but be aware
    # that this method can consume a fair amount of memory for bigger data sets.
    #
    # This method returns the duplicate table for chaining.  Don't chain
    # destructive methods (like []=()) this way though, since you are working
    # with a duplicate.
    #
    def by_col_or_row
      self.class.new(@table.dup).by_col_or_row!
    end

    #
    # Switches the mode of this table to mixed mode. All calls to indexing and
    # iteration methods will use the default intelligent indexing system until
    # the mode is changed again. In mixed mode an index is assumed to be a row
    # reference while anything else is assumed to be column access by headers.
    #
    # This method returns the table and is safe to chain.
    #
    def by_col_or_row!
      @mode = :col_or_row

      self
    end

    #
    # Returns a duplicate table object, in row mode.  This is handy for chaining
    # in a single call without changing the table mode, but be aware that this
    # method can consume a fair amount of memory for bigger data sets.
    #
    # This method returns the duplicate table for chaining.  Don't chain
    # destructive methods (like []=()) this way though, since you are working
    # with a duplicate.
    #
    def by_row
      self.class.new(@table.dup).by_row!
    end

    #
    # Switches the mode of this table to row mode. All calls to indexing and
    # iteration methods will work with rows until the mode is changed again.
    #
    # This method returns the table and is safe to chain.
    #
    def by_row!
      @mode = :row

      self
    end

    #
    # Returns the headers for the first row of this table (assumed to match all
    # other rows). The headers Array passed to CSV::Table.new is returned for
    # empty tables.
    #
    def headers
      if @table.empty?
        @headers.dup
      else
        @table.first.headers
      end
    end

    # :call-seq:
    #   table[n] -> row
    #   table[range] -> array_of_rows
    #   table[header] -> array_of_fields
    #
    # Returns data from the table;  does not modify the table.
    #
    # ---
    #
    # The expression <tt>table[n]</tt>, where +n+ is a non-negative \Integer,
    # returns the +n+th row of the table, if that row exists,
    # and if the access mode is <tt>:row</tt> or <tt>:col_or_row</tt>:
    #   source = "Name,Value\nfoo,0\nbar,1\nbaz,2\n"
    #   table = CSV.parse(source, headers: true)
    #   table.by_row! # => #<CSV::Table mode:row row_count:4>
    #   table[1] # => #<CSV::Row "Name":"bar" "Value":"1">
    #   table.by_col_or_row! # => #<CSV::Table mode:col_or_row row_count:4>
    #   table[1] # => #<CSV::Row "Name":"bar" "Value":"1">
    #
    # Counts backward from the last row if +n+ is negative:
    #   table[-1] # => #<CSV::Row "Name":"baz" "Value":"2">
    #
    # Returns +nil+ if +n+ is too large or too small:
    #   table[4] # => nil
    #   table[-4] => nil
    #
    # Raises an exception if the access mode is <tt>:row</tt>
    # and +n+ is not an
    # {Integer-convertible object}[https://docs.ruby-lang.org/en/master/implicit_conversion_rdoc.html#label-Integer-Convertible+Objects].
    #   table.by_row! # => #<CSV::Table mode:row row_count:4>
    #   # Raises TypeError (no implicit conversion of String into Integer):
    #   table['Name']
    #
    # ---
    #
    # The expression <tt>table[range]</tt>, where +range+ is a Range object,
    # returns rows from the table, beginning at row <tt>range.first</tt>,
    # if those rows exist, and if the access mode is <tt>:row</tt> or <tt>:col_or_row</tt>:
    #   source = "Name,Value\nfoo,0\nbar,1\nbaz,2\n"
    #   table = CSV.parse(source, headers: true)
    #   table.by_row! # => #<CSV::Table mode:row row_count:4>
    #   rows = table[1..2] # => #<CSV::Row "Name":"bar" "Value":"1">
    #   rows # => [#<CSV::Row "Name":"bar" "Value":"1">, #<CSV::Row "Name":"baz" "Value":"2">]
    #   table.by_col_or_row! # => #<CSV::Table mode:col_or_row row_count:4>
    #   rows = table[1..2] # => #<CSV::Row "Name":"bar" "Value":"1">
    #   rows # => [#<CSV::Row "Name":"bar" "Value":"1">, #<CSV::Row "Name":"baz" "Value":"2">]
    #
    # If there are too few rows, returns all from <tt>range.first</tt> to the end:
    #   rows = table[1..50] # => #<CSV::Row "Name":"bar" "Value":"1">
    #   rows # => [#<CSV::Row "Name":"bar" "Value":"1">, #<CSV::Row "Name":"baz" "Value":"2">]
    #
    # Special case:  if <tt>range.start == table.size</tt>, returns an empty \Array:
    #   table[table.size..50] # => []
    #
    # If <tt>range.end</tt> is negative, calculates the ending index from the end:
    #   rows = table[0..-1]
    #   rows # => [#<CSV::Row "Name":"foo" "Value":"0">, #<CSV::Row "Name":"bar" "Value":"1">, #<CSV::Row "Name":"baz" "Value":"2">]
    #
    # If <tt>range.start</tt> is negative, calculates the starting index from the end:
    #   rows = table[-1..2]
    #   rows # => [#<CSV::Row "Name":"baz" "Value":"2">]
    #
    # If <tt>range.start</tt> is larger than <tt>table.size</tt>, returns +nil+:
    #   table[4..4] # => nil
    #
    # ---
    #
    # The expression <tt>table[header]</tt>, where +header+ is a \String,
    # returns column values (\Array of \Strings) if the column exists
    # and if the access mode is <tt>:col</tt> or <tt>:col_or_row</tt>:
    #   source = "Name,Value\nfoo,0\nbar,1\nbaz,2\n"
    #   table = CSV.parse(source, headers: true)
    #   table.by_col! # => #<CSV::Table mode:col row_count:4>
    #   table['Name'] # => ["foo", "bar", "baz"]
    #   table.by_col_or_row! # => #<CSV::Table mode:col_or_row row_count:4>
    #   col = table['Name']
    #   col # => ["foo", "bar", "baz"]
    #
    # Modifying the returned column values does not modify the table:
    #   col[0] = 'bat'
    #   col # => ["bat", "bar", "baz"]
    #   table['Name'] # => ["foo", "bar", "baz"]
    #
    # Returns an \Array of +nil+ values if there is no such column:
    #   table['Nosuch'] # => [nil, nil, nil]
    def [](index_or_header)
      if @mode == :row or  # by index
         (@mode == :col_or_row and (index_or_header.is_a?(Integer) or index_or_header.is_a?(Range)))
        @table[index_or_header]
      else                 # by header
        @table.map { |row| row[index_or_header] }
      end
    end

    #
    # In the default mixed mode, this method assigns rows for index access and
    # columns for header access. You can force the index association by first
    # calling by_col!() or by_row!().
    #
    # Rows may be set to an Array of values (which will inherit the table's
    # headers()) or a CSV::Row.
    #
    # Columns may be set to a single value, which is copied to each row of the
    # column, or an Array of values. Arrays of values are assigned to rows top
    # to bottom in row major order. Excess values are ignored and if the Array
    # does not have a value for each row the extra rows will receive a +nil+.
    #
    # Assigning to an existing column or row clobbers the data. Assigning to
    # new columns creates them at the right end of the table.
    #
    def []=(index_or_header, value)
      if @mode == :row or  # by index
         (@mode == :col_or_row and index_or_header.is_a? Integer)
        if value.is_a? Array
          @table[index_or_header] = Row.new(headers, value)
        else
          @table[index_or_header] = value
        end
      else                 # set column
        unless index_or_header.is_a? Integer
          index = @headers.index(index_or_header) || @headers.size
          @headers[index] = index_or_header
        end
        if value.is_a? Array  # multiple values
          @table.each_with_index do |row, i|
            if row.header_row?
              row[index_or_header] = index_or_header
            else
              row[index_or_header] = value[i]
            end
          end
        else                  # repeated value
          @table.each do |row|
            if row.header_row?
              row[index_or_header] = index_or_header
            else
              row[index_or_header] = value
            end
          end
        end
      end
    end

    # :call-seq:
    #   table.values_at(*indexes) -> array_of_rows
    #   table.values_at(*headers) -> array_of_columns_data
    #
    # If the access mode is <tt>:row</tt> or <tt>:col_or_row</tt>,
    # and each argument is either an \Integer or a \Range,
    # returns rows.
    # Otherwise, returns columns data.
    #
    # In either case, the returned values are in the order
    # specified by the arguments.  Arguments may be repeated.
    #
    # ---
    #
    # Returns rows as an \Array of \CSV::Row objects.
    #
    # No argument:
    #   source = "Name,Value\nfoo,0\nbar,1\nbaz,2\n"
    #   table = CSV.parse(source, headers: true)
    #   table.values_at # => []
    #
    # One index:
    #   values = table.values_at(0)
    #   values # => [#<CSV::Row "Name":"foo" "Value":"0">]
    #
    # Two indexes:
    #   values = table.values_at(2, 0)
    #   values # => [#<CSV::Row "Name":"baz" "Value":"2">, #<CSV::Row "Name":"foo" "Value":"0">]
    #
    # One \Range:
    #   values = table.values_at(1..2)
    #   values # => [#<CSV::Row "Name":"bar" "Value":"1">, #<CSV::Row "Name":"baz" "Value":"2">]
    #
    # \Ranges and indexes:
    #   values = table.values_at(0..1, 1..2, 0, 2)
    #   pp values
    # Output:
    #   [#<CSV::Row "Name":"foo" "Value":"0">,
    #    #<CSV::Row "Name":"bar" "Value":"1">,
    #    #<CSV::Row "Name":"bar" "Value":"1">,
    #    #<CSV::Row "Name":"baz" "Value":"2">,
    #    #<CSV::Row "Name":"foo" "Value":"0">,
    #    #<CSV::Row "Name":"baz" "Value":"2">]
    #
    # ---
    #
    # Returns columns data as row Arrays,
    # each consisting of the specified columns data for that row:
    #   values = table.values_at('Name')
    #   values # => [["foo"], ["bar"], ["baz"]]
    #   values = table.values_at('Value', 'Name')
    #   values # => [["0", "foo"], ["1", "bar"], ["2", "baz"]]
    def values_at(*indices_or_headers)
      if @mode == :row or  # by indices
         ( @mode == :col_or_row and indices_or_headers.all? do |index|
                                      index.is_a?(Integer)         or
                                      ( index.is_a?(Range)         and
                                        index.first.is_a?(Integer) and
                                        index.last.is_a?(Integer) )
                                    end )
        @table.values_at(*indices_or_headers)
      else                 # by headers
        @table.map { |row| row.values_at(*indices_or_headers) }
      end
    end

    # :call-seq:
    #   table << row_or_array -> self
    #
    # If +row_or_array+ is a \CSV::Row object,
    # it is appended to the table:
    #   source = "Name,Value\nfoo,0\nbar,1\nbaz,2\n"
    #   table = CSV.parse(source, headers: true)
    #   table << CSV::Row.new(table.headers, ['bat', 3])
    #   table[3] # => #<CSV::Row "Name":"bat" "Value":3>
    #
    # If +row_or_array+ is an \Array, it is used to create a new
    # \CSV::Row object which is then appended to the table:
    #   table << ['bam', 4]
    #   table[4] # => #<CSV::Row "Name":"bam" "Value":4>
    def <<(row_or_array)
      if row_or_array.is_a? Array  # append Array
        @table << Row.new(headers, row_or_array)
      else                         # append Row
        @table << row_or_array
      end

      self # for chaining
    end

    #
    # :call-seq:
    #   table.push(*rows_or_arrays) -> self
    #
    # A shortcut for appending multiple rows. Equivalent to:
    #   rows.each {|row| self << row }
    #
    # Each argument may be either a \CSV::Row object or an \Array:
    #   source = "Name,Value\nfoo,0\nbar,1\nbaz,2\n"
    #   table = CSV.parse(source, headers: true)
    #   rows = [
    #     CSV::Row.new(table.headers, ['bat', 3]),
    #     ['bam', 4]
    #   ]
    #   table.push(*rows)
    #   table[3..4] # => [#<CSV::Row "Name":"bat" "Value":3>, #<CSV::Row "Name":"bam" "Value":4>]
    def push(*rows)
      rows.each { |row| self << row }

      self # for chaining
    end

    # :call-seq:
    #   table.delete(*indexes) -> deleted_values
    #   table.delete(*headers) -> deleted_values
    #
    # If the access mode is <tt>:row</tt> or <tt>:col_or_row</tt>,
    # and each argument is either an \Integer or a \Range,
    # returns deleted rows.
    # Otherwise, returns deleted columns data.
    #
    # In either case, the returned values are in the order
    # specified by the arguments.  Arguments may be repeated.
    #
    # ---
    #
    # Returns rows as an \Array of \CSV::Row objects.
    #
    # One index:
    #   source = "Name,Value\nfoo,0\nbar,1\nbaz,2\n"
    #   table = CSV.parse(source, headers: true)
    #   deleted_values = table.delete(0)
    #   deleted_values # => [#<CSV::Row "Name":"foo" "Value":"0">]
    #
    # Two indexes:
    #   table = CSV.parse(source, headers: true)
    #   deleted_values = table.delete(2, 0)
    #   deleted_values # => [#<CSV::Row "Name":"baz" "Value":"2">, #<CSV::Row "Name":"foo" "Value":"0">]
    #
    # ---
    #
    # Returns columns data as column Arrays.
    #
    # One header:
    #   table = CSV.parse(source, headers: true)
    #   deleted_values = table.delete('Name')
    #   deleted_values # => ["foo", "bar", "baz"]
    #
    # Two headers:
    #   table = CSV.parse(source, headers: true)
    #   deleted_values = table.delete('Value', 'Name')
    #   deleted_values # => [["0", "1", "2"], ["foo", "bar", "baz"]]
    def delete(*indexes_or_headers)
      if indexes_or_headers.empty?
        raise ArgumentError, "wrong number of arguments (given 0, expected 1+)"
      end
      deleted_values = indexes_or_headers.map do |index_or_header|
        if @mode == :row or  # by index
            (@mode == :col_or_row and index_or_header.is_a? Integer)
          @table.delete_at(index_or_header)
        else                 # by header
          if index_or_header.is_a? Integer
            @headers.delete_at(index_or_header)
          else
            @headers.delete(index_or_header)
          end
          @table.map { |row| row.delete(index_or_header).last }
        end
      end
      if indexes_or_headers.size == 1
        deleted_values[0]
      else
        deleted_values
      end
    end

    # Removes rows or columns for which the block returns a truthy value;
    # returns +self+.
    #
    # Removes rows when the access mode is <tt>:row</tt> or <tt>:col_or_row</tt>;
    # calls the block with each \CSV::Row object:
    #   source = "Name,Value\nfoo,0\nbar,1\nbaz,2\n"
    #   table = CSV.parse(source, headers: true)
    #   table.by_row! # => #<CSV::Table mode:row row_count:4>
    #   table.size # => 3
    #   table.delete_if {|row| row['Name'].start_with?('b') }
    #   table.size # => 1
    #
    # Removes columns when the access mode is <tt>:col</tt>;
    # calls the block with each column as a 2-element array
    # containing the header and an \Array of column fields:
    #   source = "Name,Value\nfoo,0\nbar,1\nbaz,2\n"
    #   table = CSV.parse(source, headers: true)
    #   table.by_col! # => #<CSV::Table mode:col row_count:4>
    #   table.headers.size # => 2
    #   table.delete_if {|column_data| column_data[1].include?('2') }
    #   table.headers.size # => 1
    #
    # Returns a new \Enumerator if no block is given:
    #   source = "Name,Value\nfoo,0\nbar,1\nbaz,2\n"
    #   table = CSV.parse(source, headers: true)
    #   table.delete_if # => #<Enumerator: #<CSV::Table mode:col_or_row row_count:4>:delete_if>
    def delete_if(&block)
      return enum_for(__method__) { @mode == :row or @mode == :col_or_row ? size : headers.size } unless block_given?

      if @mode == :row or @mode == :col_or_row  # by index
        @table.delete_if(&block)
      else                                      # by header
        deleted = []
        headers.each do |header|
          deleted << delete(header) if yield([header, self[header]])
        end
      end

      self # for chaining
    end

    include Enumerable

    # Calls the block with each row or column; returns +self+.
    #
    # When the access mode is <tt>:row</tt> or <tt>:col_or_row</tt>,
    # calls the block with each \CSV::Row object:
    #   source = "Name,Value\nfoo,0\nbar,1\nbaz,2\n"
    #   table = CSV.parse(source, headers: true)
    #   table.by_row! # => #<CSV::Table mode:row row_count:4>
    #   table.each {|row| p row }
    # Output:
    #   #<CSV::Row "Name":"foo" "Value":"0">
    #   #<CSV::Row "Name":"bar" "Value":"1">
    #   #<CSV::Row "Name":"baz" "Value":"2">
    #
    # When the access mode is <tt>:col</tt>,
    # calls the block with each column as a 2-element array
    # containing the header and an \Array of column fields:
    #   table.by_col! # => #<CSV::Table mode:col row_count:4>
    #   table.each {|column_data| p column_data }
    # Output:
    #   ["Name", ["foo", "bar", "baz"]]
    #   ["Value", ["0", "1", "2"]]
    #
    # Returns a new \Enumerator if no block is given:
    #   table.each # => #<Enumerator: #<CSV::Table mode:col row_count:4>:each>
    def each(&block)
      return enum_for(__method__) { @mode == :col ? headers.size : size } unless block_given?

      if @mode == :col
        headers.each { |header| yield([header, self[header]]) }
      else
        @table.each(&block)
      end

      self # for chaining
    end

    # Returns +true+ if all each row of +self+ <tt>==</tt>
    # the corresponding row of +other_table+, otherwise, +false+.
    #
    # The access mode does no affect the result.
    #
    # Equal tables:
    #   source = "Name,Value\nfoo,0\nbar,1\nbaz,2\n"
    #   table = CSV.parse(source, headers: true)
    #   other_table = CSV.parse(source, headers: true)
    #   table == other_table # => true
    #
    # Different row count:
    #   other_table.delete(2)
    #   table == other_table # => false
    #
    # Different last row:
    #   other_table << ['bat', 3]
    #   table == other_table # => false
    def ==(other)
      return @table == other.table if other.is_a? CSV::Table
      @table == other
    end

    #
    # Returns the table as an Array of Arrays. Headers will be the first row,
    # then all of the field rows will follow.
    #
    def to_a
      array = [headers]
      @table.each do |row|
        array.push(row.fields) unless row.header_row?
      end

      array
    end

    #
    # Returns the table as a complete CSV String. Headers will be listed first,
    # then all of the field rows.
    #
    # This method assumes you want the Table.headers(), unless you explicitly
    # pass <tt>:write_headers => false</tt>.
    #
    def to_csv(write_headers: true, **options)
      array = write_headers ? [headers.to_csv(**options)] : []
      @table.each do |row|
        array.push(row.fields.to_csv(**options)) unless row.header_row?
      end

      array.join("")
    end
    alias_method :to_s, :to_csv

    #
    # Extracts the nested value specified by the sequence of +index+ or +header+ objects by calling dig at each step,
    # returning nil if any intermediate step is nil.
    #
    def dig(index_or_header, *index_or_headers)
      value = self[index_or_header]
      if value.nil?
        nil
      elsif index_or_headers.empty?
        value
      else
        unless value.respond_to?(:dig)
          raise TypeError, "#{value.class} does not have \#dig method"
        end
        value.dig(*index_or_headers)
      end
    end

    # Shows the mode and size of this table in a US-ASCII String.
    def inspect
      "#<#{self.class} mode:#{@mode} row_count:#{to_a.size}>".encode("US-ASCII")
    end
  end
end