1 year ago
#388754
Michael Kopp
How to export data from a C++ object wrapped with pybind11 to a pandas DataFrame with a minimal number of copies?
I have a C++ class `Plants` that contains data in the form of nested/linked/... objects.
This class is accessible in Python via `pybind11`.
I want to access some information of the contained data in "tabular" form and work with it using `pandas` (in a `DataFrame`).
Since there will be a lot of flowers, this export should be as efficient as possible, hence I want to avoid unnecessary copies/conversions of the data.
The code below illustrates my first attempts.
Note that in this example the height is a 32-bit integer.
If I store that in an STL vector, there seems to be a conversion to the Python `int` (which is 64 bit), hence my DataFrame has dtype `int64` for that column.
I can "work around that" by using an Eigen vector with `int32`, and the resulting column really has `int32` dtype.
Now I am wondering, how much conversion/copying/... is going on in these cases and whether there is an even better approach.
I was looking at the `py::capsule` and `py::array_t` features, and maybe I have to set up one capsule per column and work with raw pointers/arrays to store the data in there and pass that out.
Here is the C++ code; I build it with
c++ -O3 -Wall -shared -std=c++17 -fPIC $(python3 -m pybind11 --includes) -I /usr/include/eigen3 todataframe.cpp -o plants.so
#include <Eigen/Core>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <pybind11/eigen.h> // support returning eigen vectors
#include <pybind11/pybind11.h>
#include <pybind11/stl.h> // support returning stl vectors, strings, ...
#include <string>
#include <utility>
#include <vector>
namespace py = pybind11;
// the C++ data structures
/// A tree, described by its name, age and height.
///
/// Plain aggregate: positional brace-initialisation in declaration order
/// (e.g. `Tree{"Joseph", 12, 100}`) keeps working. The numeric members carry
/// default initialisers so that a default-constructed `Tree` never exposes
/// indeterminate values.
struct Tree {
  /// Display name of the tree.
  std::string name;
  /// Age of the tree. NOTE(review): unit is not stated here -- presumably years.
  std::int64_t age{0};
  /// Height in centimetres, stored as a 32-bit integer.
  std::int32_t height_cm{0};
};
/// A rose, described by its height and name, plus links to nearby trees.
///
/// Plain aggregate: positional brace-initialisation in declaration order
/// (e.g. `Rose{13, "Petunia"}`) keeps working. `height_cm` carries a default
/// initialiser so that a default-constructed `Rose` never exposes an
/// indeterminate value.
struct Rose {
  /// Height in centimetres, stored as a 32-bit integer.
  std::int32_t height_cm{0};
  /// Display name of the rose.
  std::string name;
  /// Non-owning pointers to trees; empty by default.
  /// NOTE(review): the pointees must outlive this Rose. If they point into a
  /// `std::vector<Tree>`, any reallocation of that vector invalidates them.
  std::vector<Tree *> trees_close_by{};
};
/// Container for every plant known to the module: one vector per plant kind.
/// Both vectors start out empty.
struct Plants {
  /// All roses, stored by value.
  std::vector<Rose> roses;
  /// All trees, stored by value.
  std::vector<Tree> trees;
};
// python binding code
// Python module `plants`: exposes the `Plants` container together with a
// column-wise export (`as_dict`) that pandas can ingest as a DataFrame.
PYBIND11_MODULE(plants, m) {
  m.doc() = "Show all available plants";

  py::class_<Plants>(m, "Plants")
      // Constructor: fills the object with a small hard-coded demo data set
      // (two roses, two trees). The factory hands back a std::unique_ptr, so
      // ownership is already unambiguous and no return value policy is needed.
      .def(py::init([]() {
        auto plants_ptr = std::make_unique<Plants>();
        plants_ptr->roses.push_back(Rose{13, "Petunia"});
        plants_ptr->roses.push_back(Rose{17, "Alberta"});
        plants_ptr->trees.push_back(Tree{"Joseph", 12, 100});
        plants_ptr->trees.push_back(Tree{"Georg", 2, 12});
        return plants_ptr;
      }))
      // Short summary, e.g. "Plants(num roses: 2, num trees: 2)".
      .def("__repr__",
           [](const Plants &self) {
             return "Plants(num roses: " + std::to_string(self.roses.size()) +
                    ", num trees: " + std::to_string(self.trees.size()) + ")";
           })
      // Returns a dict of columns which is suitable for DataFrame to "ingest".
      //
      // Keys:
      //   "names"   -- Python list of str, roses first, then trees.
      //   "heights" -- heights in the same order; either a NumPy int32 array
      //                (use_eigen_vector=True) or a Python list of int
      //                (use_eigen_vector=False).
      //
      // Every column is written straight into its final container, so no
      // intermediate std::vector has to be filled and then converted again.
      .def(
          "as_dict",
          [](const Plants &self, bool use_eigen_vector) {
            using HeightVector = Eigen::Matrix<std::int32_t, Eigen::Dynamic, 1>;

            // Strings have to become Python str objects in any case; append
            // them to the Python list directly.
            py::list names;
            for (const auto &rose : self.roses) {
              names.append(rose.name);
            }
            for (const auto &tree : self.trees) {
              names.append(tree.name);
            }

            py::dict data;
            data["names"] = names;

            if (use_eigen_vector) {
              // Fill the Eigen vector in place and then *move* it to Python:
              // pybind11 keeps the moved-in Eigen object alive and lets the
              // NumPy array reference its buffer instead of copying it.
              HeightVector heights(static_cast<HeightVector::Index>(
                  self.roses.size() + self.trees.size()));
              HeightVector::Index row = 0;
              for (const auto &rose : self.roses) {
                heights(row++) = rose.height_cm;
              }
              for (const auto &tree : self.trees) {
                heights(row++) = tree.height_cm;
              }
              data["heights"] = py::cast(std::move(heights));
            } else {
              // Plain Python list of int objects (pandas infers int64 here).
              py::list heights;
              for (const auto &rose : self.roses) {
                heights.append(rose.height_cm);
              }
              for (const auto &tree : self.trees) {
                heights.append(tree.height_cm);
              }
              data["heights"] = heights;
            }
            return data;
          },
          py::arg("use_eigen_vector") = false);
}
And here is some Python code that uses this module and creates a DataFrame:
import plants
import pandas as pd

# Build the demo object once, then export it through both code paths and show
# which dtypes pandas infers for the resulting columns.
p = plants.Plants()
for banner, eigen_flag in (
    ("column as stl vector", False),
    ("column as Eigen vector", True),
):
    print(banner)
    df = pd.DataFrame(p.as_dict(use_eigen_vector=eigen_flag))
    print(df.dtypes)
python
c++
pandas
dataframe
pybind11
0 Answers
Your Answer