You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
73 lines
2.0 KiB
73 lines
2.0 KiB
#include "PPTExtractor.h"

#include <zip.h>

#include <iostream>
#include <memory>
#include <sstream>
#include <string>

#include <pugixml.hpp>
|
|
|
|
PPTExtractor::PPTExtractor(const std::string& filePath) : filePath(filePath) {}
|
|
|
|
/// @brief Destructor; performs no explicit cleanup of its own.
PPTExtractor::~PPTExtractor() = default;
|
|
|
|
bool PPTExtractor::isPPTX() {
|
|
return filePath.substr(filePath.find_last_of(".") + 1) == "pptx";
|
|
}
|
|
|
|
bool PPTExtractor::extractText(std::vector<std::string>& slidesText) {
|
|
if (isPPTX()) {
|
|
return extractTextFromPPTX(slidesText);
|
|
}
|
|
else {
|
|
std::cerr << "Only PPTX files are supported." << std::endl;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool PPTExtractor::extractTextFromPPTX(std::vector<std::string>& slidesText) {
|
|
int err = 0;
|
|
zip_t* zip = zip_open(filePath.c_str(), 0, &err);
|
|
if (!zip) {
|
|
std::cerr << "Failed to open PPTX file." << std::endl;
|
|
return false;
|
|
}
|
|
|
|
for (int i = 1; ; ++i) {
|
|
std::ostringstream oss;
|
|
oss << "ppt/slides/slide" << i << ".xml";
|
|
std::string slidePath = oss.str();
|
|
|
|
struct zip_stat st;
|
|
if (zip_stat(zip, slidePath.c_str(), 0, &st) != 0) {
|
|
break; // No more slides
|
|
}
|
|
|
|
zip_file_t* slideFile = zip_fopen(zip, slidePath.c_str(), 0);
|
|
if (!slideFile) {
|
|
std::cerr << "Failed to open slide: " << slidePath << std::endl;
|
|
continue;
|
|
}
|
|
|
|
char* buffer = new char[st.size];
|
|
zip_fread(slideFile, buffer, st.size);
|
|
zip_fclose(slideFile);
|
|
|
|
pugi::xml_document doc;
|
|
pugi::xml_parse_result result = doc.load_buffer(buffer, st.size);
|
|
delete[] buffer;
|
|
|
|
if (!result) {
|
|
std::cerr << "Failed to parse slide XML: " << slidePath << std::endl;
|
|
continue;
|
|
}
|
|
|
|
std::string slideText;
|
|
for (pugi::xml_node node : doc.select_nodes("//a:t")) {
|
|
slideText += node.child_value();
|
|
slideText += "\n";
|
|
}
|
|
|
|
slidesText.push_back(slideText);
|
|
}
|
|
|
|
zip_close(zip);
|
|
return true;
|
|
} |