Extract Text from a PDF
Reconstruct text flow order and then extract text from an existing PDF.
Before running this sample, review the Getting Started sample: it contains the licensing and framework initialization code that this sample requires in order to run.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
using System;
using SolidFramework.Converters;
using SolidFramework.Converters.Plumbing;
namespace CSharp_Tutorials
{
    public static partial class Tutorials
    {
        /// <summary>
        /// Runs a <see cref="PdfToTextConverter"/> over the PDF at <paramref name="pdfPath"/>,
        /// targeting <paramref name="outputPath"/>, and reports progress and the outcome on the console.
        /// </summary>
        /// <param name="pdfPath">Path of the PDF added to the converter as its source file.</param>
        /// <param name="outputPath">Path passed to the converter as the conversion target.</param>
        /// <returns>
        /// <c>true</c> when the converter reports <see cref="ConversionStatus.Success"/>; otherwise <c>false</c>.
        /// </returns>
        public static bool ExtractPdfText(string pdfPath, string outputPath)
        {
            using (var textConverter = new PdfToTextConverter())
            {
                // Register the input document.
                textConverter.AddSourceFile(pdfPath);

                // Optional converter settings; adjust to taste.
                textConverter.DetectAndRemoveHeadersAndFooters = true;
                textConverter.KeepLineBreaks = true;
                textConverter.LineTerminator = LineTerminator.Windows;

                // Optional progress reporting: log roughly every 25 percent.
                double logThreshold = 25;
                textConverter.Progress += (source, args) =>
                {
                    double percentDone = args.Progress * 100.0 / args.MaxProgress;

                    bool belowThreshold = percentDone < logThreshold;
                    // A value above 70 while the threshold is still below 30 is skipped as well.
                    // NOTE(review): presumably progress restarts for each conversion stage - confirm.
                    bool highWhileThresholdLow = percentDone > 70 && logThreshold < 30;
                    if (belowThreshold || highWhileThresholdLow)
                    {
                        return;
                    }

                    Console.WriteLine($"{args.StatusDescription} {percentDone}%");

                    // After logging above 70 the threshold starts over at 25; otherwise it moves up by 25.
                    if (percentDone > 70)
                    {
                        logThreshold = 25;
                    }
                    else
                    {
                        logThreshold += 25;
                    }
                };

                Console.WriteLine($"Converting {pdfPath} to {outputPath}");

                // Run the conversion and inspect the reported status.
                ConversionStatus status = textConverter.ConvertTo(outputPath, true);
                if (status != ConversionStatus.Success)
                {
                    Console.WriteLine($"Converting {pdfPath} to {outputPath} failed with status: {status}");
                    Console.WriteLine();
                    return false;
                }
            }

            Console.WriteLine($"Successfully converted {pdfPath} to {outputPath}");
            Console.WriteLine();
            return true;
        }
    }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
Imports System
Imports SolidFramework.Converters
Imports SolidFramework.Converters.Plumbing
Namespace VBNet_Tutorials
    Partial Module Tutorials
        ''' <summary>
        ''' Runs a PdfToTextConverter over the PDF at <paramref name="pdfPath"/>, targeting
        ''' <paramref name="outputPath"/>, and reports progress and the outcome on the console.
        ''' </summary>
        ''' <param name="pdfPath">Path of the PDF added to the converter as its source file.</param>
        ''' <param name="outputPath">Path passed to the converter as the conversion target.</param>
        ''' <returns>True when the converter reports ConversionStatus.Success; otherwise False.</returns>
        Function ExtractPdfText(ByVal pdfPath As String, ByVal outputPath As String) As Boolean
            ' Create a PdfToTextConverter
            Using converter As New PdfToTextConverter()
                ' Add the PDF file to convert
                converter.AddSourceFile(pdfPath)

                ' Optional: Set PdfToTextConverter options to your liking
                converter.DetectAndRemoveHeadersAndFooters = True
                converter.KeepLineBreaks = True
                converter.LineTerminator = LineTerminator.Windows

                ' Optional: Add a progress/warning handler that logs roughly every 25 percent
                Dim nextPercentToLog As Double = 25
                AddHandler converter.Progress,
                    Sub(sender, progress)
                        Dim percent As Double = progress.Progress * 100.0 / progress.MaxProgress
                        ' Skip values below the threshold, and values above 70 while the threshold is still below 30.
                        ' NOTE(review): presumably progress restarts for each conversion stage - confirm.
                        If percent < nextPercentToLog OrElse (percent > 70 AndAlso nextPercentToLog < 30) Then
                            Return
                        End If
                        Console.WriteLine(progress.StatusDescription & " " & percent.ToString() & "%")
                        ' After logging above 70 the threshold starts over at 25; otherwise it moves up by 25.
                        nextPercentToLog = If(percent > 70, 25, nextPercentToLog + 25)
                    End Sub

                Console.WriteLine("Converting " & pdfPath & " to " & outputPath)

                ' Convert the file
                Dim result As ConversionStatus = converter.ConvertTo(outputPath, True)

                ' Check if it was successful
                If result <> ConversionStatus.Success Then
                    ' ToString() yields the enum member name; plain "&" concatenation of an
                    ' enum value would print its underlying number instead.
                    Console.WriteLine("Converting " & pdfPath & " to " & outputPath & " failed with status: " & result.ToString())
                    Console.WriteLine()
                    Return False
                End If
            End Using

            Console.WriteLine("Successfully converted " & pdfPath & " to " & outputPath)
            Console.WriteLine()
            Return True
        End Function
    End Module
End Namespace
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#include "Tutorials.h"
using namespace SolidFramework::Converters::Plumbing;
using namespace SolidFramework::Converters;
bool ExtractPdfText(const wstring & pdfPath, const wstring & outputPath)
{
// Create a PdfToTextConverter
auto converter = make_shared<PdfToTextConverter>();
// Add the PDF file to convert
converter->AddSourceFile(pdfPath);
// Optional: Set PdfToTextConverter options to your liking
converter->SetDetectAndRemoveHeadersAndFooters(true);
converter->SetKeepLineBreaks(true);
converter->SetLineTerminator(LineTerminator::Windows);
// Optional: Add a progress/warning handler
double nextPercentToLog = 25;
converter->OnProgress = [&nextPercentToLog] (SolidFramework::ProgressEventArgsPtr progress)
{
const double percent = progress->GetProgress() * 100.0 / progress->GetMaxProgress();
if (percent < nextPercentToLog || (percent > 70 && nextPercentToLog < 30)) { return; }
wcout << progress->GetStatusDescription() << L" " << percent << L"%" << endl;
nextPercentToLog = percent > 70 ? 25 : nextPercentToLog + 25;
};
wcout << L"Converting " << pdfPath << L" to " << outputPath << endl;
// Convert the file
auto result = converter->ConvertTo(outputPath, true);
// Check if it was successful
if (result != ConversionStatus::Success)
{
wcout << L"Converting " << pdfPath << L" to " << outputPath << L" failed with status: " << (int)result << endl << endl;
return false;
}
wcout << L"Successfully converted " << pdfPath << L" to " << outputPath << endl << endl;
return true;
}