2 years ago
#388754
Michael Kopp
How to export data from a C++ object wrapped with pybind11 to a pandas DataFrame with minimal amount of copies?
I have a C++ class Plants that contains data in form of nested/linked/... objects.
This class is accessible in python via pybind11.
I want to access some information of the contained data in "tabular" form and work with it using pandas (in a DataFrame).
Since there will be a lot of flowers, this export should be as efficient as possible, hence I want to avoid unnecessary copies/conversions of the data.
The code below illustrates my first attempts.
Note, that in this example the height is a 32 bit integer.
If I store that in an STL vector, there seems to be a conversion to the python int (which is 64 bit), hence my DataFrame has dtype int64 for that column.
I can "work around that" by using an Eigen vector with int32, and the resulting column really has int32 dtype.
Now I am wondering, how much conversion/copying/... is going on in these cases and whether there is an even better approach.
I was looking at the py::capsule and py::array_t feature and maybe I have to setup one capsule per column and work with raw pointers/arrays to store the data in there and pass that out.
Here is the C++ code; I build it with
c++ -O3 -Wall -shared -std=c++17 -fPIC $(python3 -m pybind11 --includes) -I /usr/include/eigen3 todataframe.cpp -o plants.so
#include <Eigen/Core>
#include <cstddef>
#include <memory>
#include <pybind11/eigen.h> // support returning eigen vectors
#include <pybind11/pybind11.h>
#include <pybind11/stl.h> // support returning stl vectors, strings, ...
#include <string>
#include <vector>
namespace py = pybind11;
// the C++ data structures
struct Tree {
  std::string name;
  std::int64_t age;
  std::int32_t height_cm;
};
struct Rose {
  std::int32_t height_cm;
  std::string name;
  std::vector<Tree *> trees_close_by{};
};
struct Plants {
  std::vector<Rose> roses{};
  std::vector<Tree> trees{};
};
// python binding code
PYBIND11_MODULE(plants, m) {
  m.doc() = "Show all available plants";
  py::class_<Plants>(m, "Plants")
      .def(py::init([]() {
             auto plants_ptr = std::make_unique<Plants>();
             plants_ptr->roses.push_back(Rose{13, "Petunia"});
             plants_ptr->roses.push_back(Rose{17, "Alberta"});
             plants_ptr->trees.push_back(Tree{"Joseph", 12, 100});
             plants_ptr->trees.push_back(Tree{"Georg", 2, 12});
             return plants_ptr;
           }),
           py::return_value_policy::take_ownership)
      .def("__repr__",
           [](const Plants &self) {
             return "Plants(num roses: " + std::to_string(self.roses.size()) +
                    ", num trees: " + std::to_string(self.trees.size());
           })
      // This function returns a dict which is suitable for DataFrame to
      // "ingest".
      .def(
          "as_dict",
          [](const Plants &self, bool use_eigen_vectors) {
            const auto number_of_plants = self.roses.size() + self.trees.size();
            std::vector<std::string> names;
            names.reserve(number_of_plants);
            std::vector<std::int32_t> heights;
            heights.reserve(number_of_plants);
            for (const auto &rose : self.roses) {
              names.push_back(rose.name);
              heights.push_back(rose.height_cm);
            }
            for (const auto &tree : self.trees) {
              names.push_back(tree.name);
              heights.push_back(tree.height_cm);
            }
            pybind11::dict data{};
            data["names"] = names;
            // There are two different vector kinds to use -- Eigen and stl.
            if (use_eigen_vectors) {
              // In the Eigen case, make a copy of the data, since the stl
              // vector will delete the data when going out of scope.
              Eigen::Matrix<std::int32_t, Eigen::Dynamic, 1> heights_eigen =
                  Eigen::Matrix<std::int32_t, Eigen::Dynamic, 1>::Map(
                      heights.data(), heights.size());
              data["heights"] = heights_eigen;
            } else {
              data["heights"] = heights;
            }
            return data;
          },
          py::arg("use_eigen_vector") = false);
}
and here is some python code, using this module and creating a DataFrame:
import plants
import pandas as pd
p = plants.Plants()
print("column as stl vector")
df = pd.DataFrame(p.as_dict(use_eigen_vector=False))
print(df.dtypes)
print("column as Eigen vector")
df = pd.DataFrame(p.as_dict(use_eigen_vector=True))
print(df.dtypes)
python
c++
pandas
dataframe
pybind11
0 Answers
Your Answer