Extract Text from a PDF File
Reconstruct text flow order and then extract text from an existing PDF.
This sample requires a Tools license. If you use a trial license, some of the output text will be mangled.
C#
using System;
using System.IO;
using SolidFramework.Converters.Plumbing;
using SolidFramework.Converters;
namespace ExtractText
{
    /// <summary>
    /// Console sample that extracts the text of an existing PDF file into a
    /// .txt file using the Solid Framework <c>PdfToTextConverter</c>.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Imports the license, converts the PDF to text once, and prints the
        /// conversion status to the console if the conversion did not succeed.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        [STAThread]
        static void Main(string[] args)
        {
            // Import your Solid Documents license
            SolidFramework.License.Import(@"C:\MyFolder\license.xml");

            // Set the location of the file you want to convert
            String pdfPath = @"C:\YourFolder\yourpdf.pdf";

            // Output file: same folder and name as the PDF, with the extension .txt
            String txtPath = Path.ChangeExtension(pdfPath, ".txt");

            //*Extract PDF Text*//
            using (PdfToTextConverter converter = new PdfToTextConverter())
            {
                // Add the selected file
                converter.AddSourceFile(pdfPath);

                // Force the file to be overwritten
                converter.OverwriteMode = SolidFramework.Plumbing.OverwriteMode.ForceOverwrite;

                // Convert exactly once, straight to txtPath. The earlier version also
                // called Convert() (writing a second copy into a separate output
                // directory and ignoring its result) before ConvertTo(), which ran the
                // whole conversion twice.
                ConversionStatus status = converter.ConvertTo(txtPath, true);

                // Show the status in the console window when the conversion fails,
                // and wait for a key press so the message can be read.
                if (status != ConversionStatus.Success)
                {
                    Console.WriteLine(status);
                    Console.ReadKey();
                }
            }
        }
    }
}
C++
#include "stdafx.h"
#include "SolidFramework.h"
#include <iostream>
using namespace std;
// Progress callback used by the sample converter; progress events are ignored here.
void DoProgress(SolidFramework::ProgressEventArgsPtr args)
{
    // Intentionally empty: this sample does not report progress.
    (void)args;
}
// Warning callback used by the sample converter; warning events are ignored here.
void DoWarning(SolidFramework::WarningEventArgsPtr args)
{
    // Intentionally empty: this sample does not report warnings.
    (void)args;
}
// Concrete converter for the sample: derives from PdfToTextConverterBase and
// forwards its progress and warning notifications to the free functions
// DoProgress and DoWarning.
class PdfToTextConverter : public SolidFramework::Converters::PdfToTextConverterBase
{
public:
    // Forwards a progress notification to DoProgress.
    void FireProgress(SolidFramework::ProgressEventArgsPtr args) override
    {
        DoProgress(args);
    }

    // Forwards a warning notification to DoWarning.
    void FireWarning(SolidFramework::WarningEventArgsPtr args) override
    {
        DoWarning(args);
    }
};
int _tmain(int argc, _TCHAR* argv[])
{
// Enter your License code goes here
SolidFramework::License::Import(L"C:\\MyFolder\\license.xml");
// Create a PDF to Word Converter called converter
PdfToTextConverter *converter = new PdfToTextConverter();
SolidFramework::Converters::CustomData *pData = NULL;
pData = new SolidFramework::Converters::CustomData();
pData->Converter = converter;
pData->Data = nullptr;
converter->setCustomData(pData);
// Add the PDF file to convert.
converter->AddSourceFile(L"C:\\YourFolder\\yourpdf.pdf");
converter->setOutputDirectory(L"C:\\MyFolder");
// Force the file to be overwritten
converter->setOverwriteMode(SolidFramework::Plumbing::OverwriteMode::ForceOverwrite);
//Start the Conversion
cout << "Starting conversion." << endl; converter->Convert();
SolidFramework::Converters::Plumbing::ConversionStatus status = converter->getResults()->getItem(0)->getStatus();
if (status != SolidFramework::Converters::Plumbing::ConversionStatus::Success)
{
cout << "Conversion failed." << endl;
}
else
{
cout << "Conversion succeeded." << endl; } converter->Dispose();
cout << "Press <Enter> to exit." << endl;
cin.get();
return 0;
}
VB.Net
Imports System.IO
Module ExtractText
    ''' <summary>
    ''' Imports the license, converts one PDF file to a .txt file next to it,
    ''' and prints the conversion status to the console if it did not succeed.
    ''' </summary>
    Sub Main()
        ' Import your Solid Documents license
        SolidFramework.License.Import("C:\MyFolder\license.xml")

        ' Source PDF file
        Dim sPdfPath As String = "C:\YourFolder\filepdf.pdf"

        ' Output file: same folder and name as the PDF, with the extension .txt
        Dim txtPath As String = Path.ChangeExtension(sPdfPath, ".txt")

        ' Using guarantees the converter is disposed even if adding the file
        ' or converting throws; the earlier explicit Dispose() call was
        ' skipped in that case.
        Using myConverter As New SolidFramework.Converters.PdfToTextConverter()
            ' Add the file to convert
            myConverter.AddSourceFile(sPdfPath)

            ' Convert the file straight to txtPath, overwriting any existing
            ' output, and keep the status instead of discarding it.
            Dim status As SolidFramework.Converters.Plumbing.ConversionStatus = myConverter.ConvertTo(txtPath, True)

            ' Report a failed conversion and wait for a key press so the
            ' message can be read.
            If status <> SolidFramework.Converters.Plumbing.ConversionStatus.Success Then
                System.Console.WriteLine(status)
                System.Console.ReadKey()
            End If
        End Using
    End Sub
End Module
