Extract Text from a PDF File
Reconstruct text flow order and then extract text from an existing PDF.
This sample requires a Tools license. If you use a trial license, some of the output will be mangled.
C#
using System;
using System.IO;
using SolidFramework.Converters;
using SolidFramework.Converters.Plumbing;

namespace ExtractText
{
    /// <summary>
    /// Sample console program that extracts the text of an existing PDF file
    /// into a .txt file using <see cref="PdfToTextConverter"/>.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Imports the license, converts the PDF to text once, and prints the
        /// conversion status to the console when it is not
        /// <see cref="ConversionStatus.Success"/>.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        [STAThread]
        static void Main(string[] args)
        {
            // Import your Solid Documents license.
            SolidFramework.License.Import(@"C:\MyFolder\license.xml");

            // Location of the PDF file you want to convert.
            string pdfPath = @"C:\YourFolder\yourpdf.pdf";

            // Destination file: same path as the PDF, with a .txt extension.
            string txtPath = Path.ChangeExtension(pdfPath, ".txt");

            // The using block disposes the converter even if a call below throws.
            using (PdfToTextConverter converter = new PdfToTextConverter())
            {
                // Add the selected file.
                converter.AddSourceFile(pdfPath);

                // Force an existing output file to be overwritten.
                converter.OverwriteMode = SolidFramework.Plumbing.OverwriteMode.ForceOverwrite;

                // Retained from the original sample. ConvertTo below is given the
                // full destination path.
                // NOTE(review): confirm whether OutputDirectory affects ConvertTo.
                converter.OutputDirectory = @"C:\MyFolder\";

                // Convert exactly once. The original sample called Convert() and then
                // ConvertTo(), running the conversion twice just to read a status.
                ConversionStatus status = converter.ConvertTo(txtPath, true);

                // Report a failure and keep the console window open so it can be read.
                if (status != ConversionStatus.Success)
                {
                    Console.WriteLine(status);
                    Console.ReadKey();
                }
            }
        }
    }
}
C++
#include "stdafx.h"
#include "SolidFramework.h"
#include <iostream>

using namespace std;

// Progress callback target; intentionally left empty in this sample.
void DoProgress(SolidFramework::ProgressEventArgsPtr args)
{
    // Not implemented
}

// Warning callback target; intentionally left empty in this sample.
void DoWarning(SolidFramework::WarningEventArgsPtr args)
{
    // Not implemented
}

// PDF-to-text converter that forwards its progress and warning events
// to the free functions above.
class PdfToTextConverter : public SolidFramework::Converters::PdfToTextConverterBase
{
public:
    void FireProgress(SolidFramework::ProgressEventArgsPtr args) override
    {
        DoProgress(args);
    }

    void FireWarning(SolidFramework::WarningEventArgsPtr args) override
    {
        DoWarning(args);
    }
};

// Imports the license, converts one PDF file to text, and prints whether
// the first conversion result reports success.
int _tmain(int argc, _TCHAR* argv[])
{
    namespace Converters = SolidFramework::Converters;
    using ConversionStatus = Converters::Plumbing::ConversionStatus;

    // Import your license file.
    SolidFramework::License::Import(L"C:\\MyFolder\\license.xml");

    // Create the PDF to text converter.
    PdfToTextConverter *converter = new PdfToTextConverter();

    // Attach a custom-data record that points back at the converter.
    Converters::CustomData *customData = new Converters::CustomData();
    customData->Converter = converter;
    customData->Data = nullptr;
    converter->setCustomData(customData);

    // Source file and output directory.
    converter->AddSourceFile(L"C:\\YourFolder\\yourpdf.pdf");
    converter->setOutputDirectory(L"C:\\MyFolder");

    // Force an existing output file to be overwritten.
    converter->setOverwriteMode(SolidFramework::Plumbing::OverwriteMode::ForceOverwrite);

    // Run the conversion.
    cout << "Starting conversion." << endl;
    converter->Convert();

    // Read the status of the first (and only) result.
    ConversionStatus status = converter->getResults()->getItem(0)->getStatus();
    const bool failed = (status != ConversionStatus::Success);
    cout << (failed ? "Conversion failed." : "Conversion succeeded.") << endl;

    converter->Dispose();

    cout << "Press <Enter> to exit." << endl;
    cin.get();
    return 0;
}
VB.Net
Imports System.IO

''' <summary>
''' Sample console module that extracts the text of an existing PDF file
''' into a .txt file using SolidFramework.Converters.PdfToTextConverter.
''' </summary>
Module ExtractText

    ''' <summary>
    ''' Imports the license, converts the PDF to text, and prints the
    ''' conversion status to the console when it is not Success.
    ''' </summary>
    Sub Main()
        ' Import your Solid Documents license.
        SolidFramework.License.Import("C:\MyFolder\license.xml")

        ' Source PDF file.
        Dim sPdfPath As String = "C:\YourFolder\filepdf.pdf"

        ' Output file: same path as the PDF, with a .txt extension.
        Dim txtPath As String = Path.ChangeExtension(sPdfPath, ".txt")

        ' Using disposes the converter even if a call below throws; the original
        ' sample only reached Dispose() when every call succeeded.
        Using myConverter As New SolidFramework.Converters.PdfToTextConverter()
            ' Add the file to convert.
            myConverter.AddSourceFile(sPdfPath)

            ' Select where to save the file (retained from the original sample).
            myConverter.OutputDirectory = "C:\MyFolder\"

            ' Convert the file and keep the returned status instead of discarding it.
            Dim status As SolidFramework.Converters.Plumbing.ConversionStatus =
                myConverter.ConvertTo(txtPath, True)

            ' Report a failure and keep the console window open so it can be read.
            If status <> SolidFramework.Converters.Plumbing.ConversionStatus.Success Then
                System.Console.WriteLine(status)
                System.Console.ReadKey()
            End If
        End Using
    End Sub

End Module