Uploaded image for project: 'Parquet'
  1. Parquet
  2. PARQUET-2095

[C++] Read Parquet file with MapArray

    XMLWordPrintableJSON

Details

    • New Feature
    • Status: Resolved
    • Blocker
    • Resolution: Not A Problem
    • cpp-4.0.0
    • cpp-6.0.0
    • parquet-cpp
    • arrow-apache-arrow-3.0.0
      C++ library

      Linux operating system

    • Important

    Description

      The Parquet format reduces storage space effectively, and we use it with HDFS + Hive JNI (calling C++) + Spark JNI (calling C++), where it works well. We are now starting a new project that uses only C++, with higher performance expectations, but we have hit a blocking issue: how to read a Parquet file that contains MapArray columns such as 

      list<array_element: map<string, list<array_element: int64>>>

      list<array_element: map<string, string>>

      map<string, list<array_element: int64>>

       

      I only know how to read columns that do not contain a map type, such as 

      list<array_element: string>, list<array_element: list<array_element: string>>

      Here is the code example, please give me some advice on how to read parquet file with map type, thanks a lot!

       

      // code placeholder
      #include "gflags/gflags.h"
      #include "arrow/api.h"
      #include "arrow/array/builder_base.h"
      #include "arrow/filesystem/hdfs.h"
      #include "arrow/io/api.h"
      #include "parquet/arrow/reader.h"
      #include "parquet/column_reader.h"
      #include "parquet/exception.h"
      #include "parquet/arrow/reader.h"
      
      int main(int argc, char** argv) {
          gflags::ParseCommandLineFlags(&argc, &argv, true);
          arrow::Status st;
          arrow::MemoryPool* pool = ::arrow::default_memory_pool();
          std::shared_ptr<arrow::io::RandomAccessFile> input = nullptr;
          std::shared_ptr<::arrow::io::RandomAccessFile> _infile;
          PARQUET_ASSIGN_OR_THROW(
                          _infile,
                          ::arrow::io::ReadableFile::Open(FLAGS_input_file,
                                  ::arrow::default_memory_pool()));
          // Open Parquet file reader
          std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
          st = parquet::arrow::OpenFile(_infile, pool, &arrow_reader);
          if (!st.ok()) {
                  LOG(ERROR) << "open file failed " << FLAGS_input_file;
                  return 0;
          }
      
      
          // Read entire file as a single Arrow table
          std::shared_ptr<arrow::Table> table;
          st = arrow_reader->ReadTable(&table);
          if (!st.ok()) {
                  LOG(INFO) << "read file to table successfully " << FLAGS_input_file;
          }   
           
          size_t num_cols = table->num_columns();
          for (size_t idx = 0; idx < num_cols; idx++) {
            auto this_field = table->field(idx);
            auto this_column = table->column(idx);
          if (this_field->name() == "lls_column") { // works well type: list<array_element: list<array_element: string>>
              for (size_t c_idx = 0; c_idx < this_column->num_chunks(); c_idx++) {
                auto row_array =
                  std::static_pointer_cast<arrow::ListArray>(this_column->chunk(c_idx));
                auto sample_array =
                  std::static_pointer_cast<arrow::ListArray>(row_array->values());
                auto id_array =
                  std::static_pointer_cast<arrow::StringArray>(sample_array->values());
                for (int64_t i = 0; i < table->num_rows(); i++) {
                  auto offset = row_array->value_offset(i);
                  auto count = row_array->value_length(i);
                  for (auto x = 0; x < count; x++) {
                    std::vector<std::string> result;
                    auto sample_offset = sample_array->value_offset(offset+x);
                    auto id_count = sample_array->value_length(offset+x);
                    for (auto id = 0; id < id_count; id++) {
                      int32_t len;
                      const uint8_t* addr = id_array->GetValue(sample_offset + id, &len);
                      result.push_back(std::string(reinterpret_cast<const char*>(addr), (int16_t)len));
                    }
                    LOG(INFO) << "LLS " << count << " " << this_field->name() << " " << to_string(result); // works well
                  }
                }
              }
            }
            else if (this_field->name() == "ms2li_column") { // MS2LI type: map<string, list<array_element: int64>> 
              LOG(INFO)  << "col name: " << this_field->name() << " type: " << this_field->type()->ToString();
              LOG(INFO)  << "length: " << this_column->length() << " chunk num: " << this_column->num_chunks();
              for (size_t c_idx = 0; c_idx < this_column->num_chunks(); c_idx++) {
                auto row_array =
                  std::static_pointer_cast<arrow::MapArray>(this_column->chunk(c_idx));
                auto keys_array =
                  std::static_pointer_cast<arrow::StringArray>(row_array->keys());
                auto item_array =
                  std::static_pointer_cast<arrow::ListArray>(row_array->items());
                auto item_value_array =
                  std::static_pointer_cast<arrow::ListArray>(item_array->values());
                auto id_array =
                  std::static_pointer_cast<arrow::Int64Array>(item_value_array->values());
              // I've no idea how to traverse the map<string, list<array_element: int64>> to get key and value correctly, 
             }
            }
          }
      
      

      It seems that arrow::MapArray::keys() and items() lose each map pair's offset, so I cannot find the right pair in the list<array_element: map<string, string>> format. I would really appreciate your help.

      Attachments

        1. image-2021-09-26-20-36-27-621.png
          105 kB
          jiang,longshan

        Activity

          People

            Unassigned Unassigned
            longshanpdd jiang,longshan
            Votes:
            0 Vote for this issue
            Watchers:
            3 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved: