#include "PPTExtractor.h"

#include <algorithm>
#include <cctype>
#include <cstddef>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <vector>

#include <pugixml.hpp>
#include <zip.h>

namespace {

/// Extension (without the dot, lower-case) that identifies a PPTX file.
const std::string kPptxExtension = "pptx";

/// Deleter that releases a libzip archive handle opened for reading only.
/// zip_discard() is used because nothing is ever written back to the archive.
struct ZipArchiveCloser {
    void operator()(zip_t* archive) const noexcept { zip_discard(archive); }
};

/// Deleter that closes a file opened inside a libzip archive.
struct ZipFileCloser {
    void operator()(zip_file_t* file) const noexcept { zip_fclose(file); }
};

using ZipArchivePtr = std::unique_ptr<zip_t, ZipArchiveCloser>;
using ZipFilePtr = std::unique_ptr<zip_file_t, ZipFileCloser>;

}  // namespace

/// @brief Stores the path of the presentation; the file is not opened here.
/// @param filePath Path to the presentation file.
PPTExtractor::PPTExtractor(const std::string& filePath) : filePath(filePath) {}

PPTExtractor::~PPTExtractor() {}

/// @brief Tells whether the stored path ends in a ".pptx" extension.
///
/// The comparison is case-insensitive. A path that contains no '.' at all has
/// no extension and is therefore never treated as PPTX.
/// @return true when the text after the last '.' equals "pptx".
bool PPTExtractor::isPPTX() {
    const std::string::size_type dot = filePath.find_last_of('.');
    if (dot == std::string::npos) {
        return false;
    }

    const std::string extension = filePath.substr(dot + 1);
    if (extension.size() != kPptxExtension.size()) {
        return false;
    }
    return std::equal(extension.begin(), extension.end(), kPptxExtension.begin(),
                      [](char actual, char expected) {
                          // Cast first: std::tolower is undefined for negative char values.
                          return std::tolower(static_cast<unsigned char>(actual)) == expected;
                      });
}

/// @brief Extracts the text of every slide, one string per slide.
/// @param slidesText Receives one entry per successfully parsed slide (appended).
/// @return true on success; false when the file is not PPTX or cannot be opened.
bool PPTExtractor::extractText(std::vector<std::string>& slidesText) {
    if (!isPPTX()) {
        std::cerr << "Only PPTX files are supported." << std::endl;
        return false;
    }
    return extractTextFromPPTX(slidesText);
}

/// @brief Reads "ppt/slides/slideN.xml" entries from the archive and collects
///        the contents of their <a:t> elements.
///
/// Slides are probed by number starting at 1, and probing stops at the first
/// number that has no entry in the archive. A slide that cannot be opened,
/// read, or parsed is reported on stderr and skipped, so it contributes no
/// entry to @p slidesText.
///
/// @param slidesText Receives one string per parsed slide; each text run is
///                   followed by a newline.
/// @return false only when the archive itself cannot be opened; true otherwise,
///         including when no slides were found.
bool PPTExtractor::extractTextFromPPTX(std::vector<std::string>& slidesText) {
    int err = 0;
    // The archive is only read, so open it read-only.
    ZipArchivePtr archive(zip_open(filePath.c_str(), ZIP_RDONLY, &err));
    if (!archive) {
        std::cerr << "Failed to open PPTX file." << std::endl;
        return false;
    }

    for (int slideNumber = 1;; ++slideNumber) {
        std::ostringstream pathBuilder;
        pathBuilder << "ppt/slides/slide" << slideNumber << ".xml";
        const std::string slidePath = pathBuilder.str();

        struct zip_stat st;
        if (zip_stat(archive.get(), slidePath.c_str(), 0, &st) != 0) {
            break;  // No more slides
        }
        if ((st.valid & ZIP_STAT_SIZE) == 0) {
            // Without a known uncompressed size the read buffer cannot be sized.
            std::cerr << "Failed to read slide: " << slidePath << std::endl;
            continue;
        }

        std::string xml(static_cast<std::size_t>(st.size), '\0');
        std::size_t bytesRead = 0;
        {
            ZipFilePtr slideFile(zip_fopen(archive.get(), slidePath.c_str(), 0));
            if (!slideFile) {
                std::cerr << "Failed to open slide: " << slidePath << std::endl;
                continue;
            }

            // zip_fread may return fewer bytes than requested; keep reading
            // until the whole entry is in memory or an error/EOF occurs.
            while (bytesRead < xml.size()) {
                const zip_int64_t chunk =
                    zip_fread(slideFile.get(), &xml[bytesRead], xml.size() - bytesRead);
                if (chunk <= 0) {
                    break;
                }
                bytesRead += static_cast<std::size_t>(chunk);
            }
        }  // slideFile is closed here, before parsing.

        if (bytesRead != xml.size()) {
            std::cerr << "Failed to read slide: " << slidePath << std::endl;
            continue;
        }

        pugi::xml_document doc;
        const pugi::xml_parse_result result = doc.load_buffer(xml.data(), xml.size());
        if (!result) {
            std::cerr << "Failed to parse slide XML: " << slidePath << std::endl;
            continue;
        }

        std::string slideText;
        // select_nodes yields xpath_node objects; the element is obtained via node().
        for (const pugi::xpath_node& match : doc.select_nodes("//a:t")) {
            slideText += match.node().child_value();
            slideText += "\n";
        }
        slidesText.push_back(slideText);
    }

    return true;
}