Extract Data from a PDF
The code below shows you how to extract data from PDF files.
Data can be extracted as text (comma, semicolon or tab delimited), unformatted Excel, or as MySQL or MS-SQL compatible statements.
This sample requires a Professional license.
C#
using SolidFramework.Converters;
using SolidFramework.Converters.Plumbing;
using System;
using System.IO;
namespace PDFtoData
{
    /// <summary>
    /// Console sample that converts a PDF file to a data format using Solid Framework.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Imports the license, converts the sample PDF and then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        [STAThread]
        static void Main(string[] args)
        {
            // Import your Solid Documents license before using the converter.
            SolidFramework.License.Import(@"C:\MyFolder\license.xml");

            // The PDF to convert. The output path is the same path with the
            // extension swapped for the one being created.
            string pdfPath = @"C:\YourFolder\yourpdf.pdf";
            string outputPath = Path.ChangeExtension(pdfPath, ".xlsx");

            ConvertPdfToData(pdfPath, outputPath);

            Console.ReadKey();
        }

        /// <summary>
        /// Runs the PDF to data conversion and writes the status to the console
        /// when it is anything other than <c>Success</c>.
        /// </summary>
        /// <param name="sourcePath">Path of the PDF file to convert.</param>
        /// <param name="destinationPath">Path of the file to create.</param>
        private static void ConvertPdfToData(string sourcePath, string destinationPath)
        {
            using (PdfToDataConverter pdfToData = new PdfToDataConverter())
            {
                pdfToData.AddSourceFile(sourcePath);

                // Pick ONE export format. Unformatted Excel is active here;
                // the alternatives are left commented out for reference.
                //
                // CSV (delimited text):
                //pdfToData.ExportFormat = SolidFramework.Converters.Plumbing.DataExportFormat.Text;
                //pdfToData.DelimiterOption = DelimiterOptions.Comma;
                //
                // MySQL statements:
                //pdfToData.ExportFormat = SolidFramework.Converters.Plumbing.DataExportFormat.MySQL;
                //
                // Microsoft SQL statements:
                //pdfToData.ExportFormat = SolidFramework.Converters.Plumbing.DataExportFormat.MSSQL;
                pdfToData.ExportFormat = SolidFramework.Converters.Plumbing.DataExportFormat.Excel;

                // Restrict the conversion to a specific set of pages.
                pdfToData.PageRange = new SolidFramework.PageRange(new int[] { 1 });

                // Enable Solid Documents OCR for scanned files
                // (only with Pro+OCR and an OCR license).
                pdfToData.TextRecoveryEngine = TextRecoveryEngine.SolidOCR;
                pdfToData.TextRecoveryType = TextRecovery.Automatic;

                // NOTE(review): the second argument is presumably "overwrite an
                // existing output file" - confirm against the API reference.
                ConversionStatus status = pdfToData.ConvertTo(destinationPath, true);

                // Only a non-successful status is reported in the console window.
                if (status != ConversionStatus.Success)
                {
                    Console.WriteLine(status);
                }
            }
        }
    }
}
C++
#include "stdafx.h"
#include "SolidFramework.h"
#include <iostream>
using namespace std;
// Progress callback for the converter. Intentionally a no-op in this sample.
void DoProgress(SolidFramework::ProgressEventArgsPtr /*args*/)
{
}
// Warning callback for the converter. Intentionally a no-op in this sample.
void DoWarning(SolidFramework::WarningEventArgsPtr /*args*/)
{
}
// Concrete converter for this sample: overrides the base class's FireProgress
// and FireWarning hooks and forwards them to the free functions DoProgress
// and DoWarning.
class PdfToDataConverter : public SolidFramework::Converters::PdfToDataConverterBase
{
public:
    // Forwards progress notifications to DoProgress.
    void FireProgress(SolidFramework::ProgressEventArgsPtr args) override
    {
        DoProgress(args);
    }

    // Forwards warning notifications to DoWarning.
    void FireWarning(SolidFramework::WarningEventArgsPtr args) override
    {
        DoWarning(args);
    }
};
int _tmain(int argc, _TCHAR* argv[])
{
/*Enter your Solid Documents License Information Here*/
SolidFramework::License::Import(L"C:\\MyFolder\\license.xml");
PdfToDataConverter *converter = new PdfToDataConverter();
//converter->set
SolidFramework::Converters::CustomData *pData = NULL;
pData = new SolidFramework::Converters::CustomData();
pData->Converter = converter;
pData->Data = nullptr;
converter->setCustomData(pData);
// Add files to convert and directory to output converted file
converter->AddSourceFile(L"C:\\YourFolder\\yourpdf.pdf");
converter->setOutputDirectory(L"C:\\MyFolder");
// Set the options - in this case minimal Excel
//converter->setExportFormat(SolidFramework::Converters::Plumbing::DataExportFormat::Excel);
converter->setExportFormat(SolidFramework::Converters::Plumbing::DataExportFormat::MSSQL);
// Convert the file
cout << "Starting conversion." << endl; converter->Convert();
SolidFramework::Converters::Plumbing::ConversionStatus status = converter->getResults()->getItem(0)->getStatus();
if (status != SolidFramework::Converters::Plumbing::ConversionStatus::Success)
{
cout << "Conversion failed." << endl;
}
else
{
cout << "Conversion succeeded." << endl; } converter->Dispose();
cout << "Press <Enter> to exit." << endl;
cin.get();
return 0;
}
VB.Net
Imports SolidFramework.Plumbing
Imports SolidFramework.Pdf
Imports SolidFramework.Converters
Imports System.IO
Imports SolidFramework.Converters.Plumbing
''' <summary>
''' Console sample that converts a PDF file to a data format using Solid Framework.
''' </summary>
Module PDFtoData
    ''' <summary>
    ''' Imports the license, converts the sample PDF and then waits for a key press.
    ''' </summary>
    Sub Main()
        ' Import your Solid Documents license before using the converter.
        SolidFramework.License.Import("C:\Useful\license.xml")

        ' The PDF to convert. The output path is the same path with the
        ' extension swapped for the one being created.
        Dim pdfPath As String = "C:\YourFolder\yourpdf.pdf"
        Dim outputPath As String = Path.ChangeExtension(pdfPath, ".xlsx")

        ConvertPdfToData(pdfPath, outputPath)

        Console.ReadKey()
    End Sub

    ''' <summary>
    ''' Runs the PDF to data conversion and writes the status to the console
    ''' when it is anything other than Success.
    ''' </summary>
    ''' <param name="sourcePath">Path of the PDF file to convert.</param>
    ''' <param name="destinationPath">Path of the file to create.</param>
    Private Sub ConvertPdfToData(sourcePath As String, destinationPath As String)
        Using pdfToData As New PdfToDataConverter()
            pdfToData.AddSourceFile(sourcePath)

            ' Pick ONE export format. Unformatted Excel is active here;
            ' the alternatives are left commented out for reference.
            '
            ' CSV (delimited text):
            'pdfToData.ExportFormat = SolidFramework.Converters.Plumbing.DataExportFormat.Text
            'pdfToData.DelimiterOption = DelimiterOptions.Comma
            '
            ' MySQL statements:
            'pdfToData.ExportFormat = SolidFramework.Converters.Plumbing.DataExportFormat.MySQL
            '
            ' Microsoft SQL statements:
            'pdfToData.ExportFormat = SolidFramework.Converters.Plumbing.DataExportFormat.MSSQL
            pdfToData.ExportFormat = SolidFramework.Converters.Plumbing.DataExportFormat.Excel

            ' Restrict the conversion to a specific set of pages.
            pdfToData.PageRange = New SolidFramework.PageRange(New Integer() {1})

            ' Enable Solid Documents OCR for scanned files
            ' (only with Pro+OCR and an OCR license).
            pdfToData.TextRecoveryEngine = TextRecoveryEngine.SolidOCR
            pdfToData.TextRecoveryType = TextRecovery.Automatic

            ' NOTE(review): the second argument is presumably "overwrite an
            ' existing output file" - confirm against the API reference.
            Dim status As ConversionStatus = pdfToData.ConvertTo(destinationPath, True)

            ' Only a non-successful status is reported in the console window.
            If status <> ConversionStatus.Success Then
                Console.WriteLine(status)
            End If
        End Using
    End Sub
End Module
