Details
-
New Feature
-
Status: Resolved
-
Blocker
-
Resolution: Not A Problem
-
cpp-4.0.0
-
arrow-apache-arrow-3.0.0
C++ library, Linux operating system
-
Important
Description
The Parquet format reduces storage space effectively, and we use it with HDFS + Hive JNI (calling C++) and Spark JNI (calling C++), where it works well. Now we are starting a new project that uses only C++, with higher performance expectations, but we have hit a blocking issue: how to read a Parquet file with MapArray columns such as
list<array_element: map<string, list<array_element: int64>>>
list<array_element: map<string, string>>
map<string, list<array_element: int64>>
So far I only know how to read types that do not contain a map, such as
list<array_element: string>, list<array_element: list<array_element: string>>
Here is the code example, please give me some advice on how to read parquet file with map type, thanks a lot!
// code placeholder #include "gflags/gflags.h" #include "arrow/api.h" #include "arrow/array/builder_base.h" #include "arrow/filesystem/hdfs.h" #include "arrow/io/api.h" #include "parquet/arrow/reader.h" #include "parquet/column_reader.h" #include "parquet/exception.h" #include "parquet/arrow/reader.h" int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); arrow::Status st; arrow::MemoryPool* pool = ::arrow::default_memory_pool(); std::shared_ptr<arrow::io::RandomAccessFile> input = nullptr; std::shared_ptr<::arrow::io::RandomAccessFile> _infile; PARQUET_ASSIGN_OR_THROW( _infile, ::arrow::io::ReadableFile::Open(FLAGS_input_file, ::arrow::default_memory_pool())); // Open Parquet file reader std::unique_ptr<parquet::arrow::FileReader> arrow_reader; st = parquet::arrow::OpenFile(_infile, pool, &arrow_reader); if (!st.ok()) { LOG(ERROR) << "open file failed " << FLAGS_input_file; return 0; } // Read entire file as a single Arrow table std::shared_ptr<arrow::Table> table; st = arrow_reader->ReadTable(&table); if (!st.ok()) { LOG(INFO) << "read file to table successfully " << FLAGS_input_file; } size_t num_cols = table->num_columns(); for (size_t idx = 0; idx < num_cols; idx++) { auto this_field = table->field(idx); auto this_column = table->column(idx); if (this_field->name() == "lls_column") { // works well type: list<array_element: list<array_element: string>> for (size_t c_idx = 0; c_idx < this_column->num_chunks(); c_idx++) { auto row_array = std::static_pointer_cast<arrow::ListArray>(this_column->chunk(c_idx)); auto sample_array = std::static_pointer_cast<arrow::ListArray>(row_array->values()); auto id_array = std::static_pointer_cast<arrow::StringArray>(sample_array->values()); for (int64_t i = 0; i < table->num_rows(); i++) { auto offset = row_array->value_offset(i); auto count = row_array->value_length(i); for (auto x = 0; x < count; x++) { std::vector<std::string> result; auto sample_offset = sample_array->value_offset(offset+x); 
auto id_count = sample_array->value_length(offset+x); for (auto id = 0; id < id_count; id++) { int32_t len; const uint8_t* addr = id_array->GetValue(sample_offset + id, &len); result.push_back(std::string(reinterpret_cast<const char*>(addr), (int16_t)len)); } LOG(INFO) << "LLS " << count << " " << this_field->name() << " " << to_string(result); // works well } } } } else if (this_field->name() == "ms2li_column") { // MS2LI type: map<string, list<array_element: int64>> LOG(INFO) << "col name: " << this_field->name() << " type: " << this_field->type()->ToString(); LOG(INFO) << "length: " << this_column->length() << " chunk num: " << this_column->num_chunks(); for (size_t c_idx = 0; c_idx < this_column->num_chunks(); c_idx++) { auto row_array = std::static_pointer_cast<arrow::MapArray>(this_column->chunk(c_idx)); auto keys_array = std::static_pointer_cast<arrow::StringArray>(row_array->keys()); auto item_array = std::static_pointer_cast<arrow::ListArray>(row_array->items()); auto item_value_array = std::static_pointer_cast<arrow::ListArray>(item_array->values()); auto id_array = std::static_pointer_cast<arrow::Int64Array>(item_value_array->values()); // I've no idea how to traverse the map<string, list<array_element: int64>> to get key and value correctly, } } }
It seems that arrow::MapArray::keys() and items() lose each map pair's offset, so I cannot find the right pair in the list<array_element: map<string, string>> format. I really need and would appreciate your help.