Extract Data from a PDF
The code below shows you how to extract data from PDF files.
Data can be extracted as text (comma, semicolon or tab delimited), unformatted Excel, or as MySQL or MS-SQL compatible statements.
This sample requires a Professional license.
C#
using SolidFramework.Converters;
using SolidFramework.Converters.Plumbing;
using System;
using System.IO;

namespace PDFtoData
{
    /// <summary>
    /// Console sample that extracts data from a PDF file with
    /// <c>PdfToDataConverter</c> and writes it next to the source file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Imports the license, converts the first page of the sample PDF to an
        /// unformatted Excel workbook, prints the conversion status if the
        /// conversion did not succeed, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        [STAThread]
        static void Main(string[] args)
        {
            // Import your Solid Documents license
            SolidFramework.License.Import(@"C:\MyFolder\license.xml");

            // Set the location of the file you want to convert
            string pdfPath = @"C:\YourFolder\yourpdf.pdf";

            // Same folder and file name as the source, with the extension of the
            // format being created. Change ".xlsx" if you pick another export
            // format below.
            string outputPath = Path.ChangeExtension(pdfPath, ".xlsx");

            // *PDF to DATA*
            using (PdfToDataConverter converter = new PdfToDataConverter())
            {
                // Add files to convert.
                converter.AddSourceFile(pdfPath);

                // Set the preferred conversion properties.
                // Exactly one export format should be active; the alternatives
                // are left commented out for reference.

                // Convert to CSV
                //converter.ExportFormat = DataExportFormat.Text;
                //converter.DelimiterOption = DelimiterOptions.Comma;

                // Convert to simple unformatted Excel
                converter.ExportFormat = DataExportFormat.Excel;

                // Convert to MySQL
                //converter.ExportFormat = DataExportFormat.MySQL;

                // Convert to Microsoft SQL
                //converter.ExportFormat = DataExportFormat.MSSQL;

                // Set a specific page range to convert
                converter.PageRange = new SolidFramework.PageRange(new int[] { 1 });

                // Turn on Solid Documents Optical Character Recognition (OCR) for scanned files
                converter.TextRecoveryEngine = TextRecoveryEngine.SolidOCR; // Only with Pro+OCR and OCR license
                converter.TextRecoveryType = TextRecovery.Automatic;

                // Run the conversion.
                // NOTE(review): the second argument is presumably the "overwrite
                // existing output" flag - confirm against the API reference.
                ConversionStatus status = converter.ConvertTo(outputPath, true);

                // Only a non-successful status is reported in the console window
                if (status != ConversionStatus.Success)
                {
                    Console.WriteLine(status);
                }
            }

            // Keep the console window open until a key is pressed
            Console.ReadKey();
        }
    }
}
C++
#include "stdafx.h"
#include "SolidFramework.h"
#include <iostream>

using namespace std;

// Progress callback used by the converter subclass below.
// Intentionally a no-op in this sample.
void DoProgress(SolidFramework::ProgressEventArgsPtr args)
{
    // Not implemented
}

// Warning callback used by the converter subclass below.
// Intentionally a no-op in this sample.
void DoWarning(SolidFramework::WarningEventArgsPtr args)
{
    // Not implemented
}

// Concrete PDF-to-data converter: forwards the base class's progress and
// warning notifications to the free functions above.
class PdfToDataConverter : public SolidFramework::Converters::PdfToDataConverterBase
{
public:
    void FireProgress(SolidFramework::ProgressEventArgsPtr args) override
    {
        DoProgress(args);
    }

    void FireWarning(SolidFramework::WarningEventArgsPtr args) override
    {
        DoWarning(args);
    }
};

// Imports the license, converts the sample PDF to Microsoft SQL statements in
// the output directory, and reports on the console whether the first result
// succeeded. Always returns 0.
int _tmain(int argc, _TCHAR* argv[])
{
    /* Enter your Solid Documents License Information Here */
    SolidFramework::License::Import(L"C:\\MyFolder\\license.xml");

    // NOTE(review): converter and pData are allocated with raw `new` and never
    // deleted here; only converter->Dispose() is called at the end. Confirm in
    // the SDK documentation whether the framework takes ownership before
    // adding explicit deletes.
    PdfToDataConverter *converter = new PdfToDataConverter();

    // Attach custom data that points back at the converter (no user payload)
    SolidFramework::Converters::CustomData *pData = new SolidFramework::Converters::CustomData();
    pData->Converter = converter;
    pData->Data = nullptr;
    converter->setCustomData(pData);

    // Add files to convert and directory to output converted file
    converter->AddSourceFile(L"C:\\YourFolder\\yourpdf.pdf");
    converter->setOutputDirectory(L"C:\\MyFolder");

    // Set the options - in this case Microsoft SQL statements.
    // Use the commented-out line instead for minimal Excel output.
    //converter->setExportFormat(SolidFramework::Converters::Plumbing::DataExportFormat::Excel);
    converter->setExportFormat(SolidFramework::Converters::Plumbing::DataExportFormat::MSSQL);

    // Convert the file
    cout << "Starting conversion." << endl;
    converter->Convert();

    // Exactly one source file was added, so the sample reads result 0.
    // NOTE(review): assumes the results collection is non-empty after
    // Convert() - verify, or guard this if you adapt the sample.
    SolidFramework::Converters::Plumbing::ConversionStatus status =
        converter->getResults()->getItem(0)->getStatus();

    if (status != SolidFramework::Converters::Plumbing::ConversionStatus::Success)
    {
        cout << "Conversion failed." << endl;
    }
    else
    {
        cout << "Conversion succeeded." << endl;
    }

    converter->Dispose();

    cout << "Press <Enter> to exit." << endl;
    cin.get();
    return 0;
}
VB.Net
Imports SolidFramework.Plumbing
Imports SolidFramework.Pdf
Imports SolidFramework.Converters
Imports System.IO
Imports SolidFramework.Converters.Plumbing

''' <summary>
''' Console sample that extracts data from a PDF file with
''' PdfToDataConverter and writes it next to the source file.
''' </summary>
Module PDFtoData

    ''' <summary>
    ''' Imports the license, converts the first page of the sample PDF to an
    ''' unformatted Excel workbook, prints the conversion status if the
    ''' conversion did not succeed, then waits for a key press.
    ''' </summary>
    Sub Main()
        ' Import your Solid Documents license
        SolidFramework.License.Import("C:\Useful\license.xml")

        ' Set the location of the file you want to convert
        Dim pdfPath As String = "C:\YourFolder\yourpdf.pdf"

        ' Same folder and file name as the source, with the extension of the
        ' format being created. Change ".xlsx" if you pick another export
        ' format below.
        Dim outputPath As String = Path.ChangeExtension(pdfPath, ".xlsx")

        ' *PDF to DATA*
        Using converter As New PdfToDataConverter()
            ' Add files to convert.
            converter.AddSourceFile(pdfPath)

            ' Set the preferred conversion properties.
            ' Exactly one export format should be active; the alternatives
            ' are left commented out for reference.

            ' Convert to CSV
            'converter.ExportFormat = DataExportFormat.Text
            'converter.DelimiterOption = DelimiterOptions.Comma

            ' Convert to simple unformatted Excel
            converter.ExportFormat = DataExportFormat.Excel

            ' Convert to MySQL
            'converter.ExportFormat = DataExportFormat.MySQL

            ' Convert to Microsoft SQL
            'converter.ExportFormat = DataExportFormat.MSSQL

            ' Set a specific page range to convert
            converter.PageRange = New SolidFramework.PageRange(New Integer() {1})

            ' Turn on Solid Documents Optical Character Recognition (OCR) for scanned files
            converter.TextRecoveryEngine = TextRecoveryEngine.SolidOCR ' Only with Pro+OCR and OCR license
            converter.TextRecoveryType = TextRecovery.Automatic

            ' Run the conversion.
            ' NOTE(review): the second argument is presumably the "overwrite
            ' existing output" flag - confirm against the API reference.
            Dim status As ConversionStatus = converter.ConvertTo(outputPath, True)

            ' Only a non-successful status is reported in the console window
            If status <> ConversionStatus.Success Then
                Console.WriteLine(status)
            End If
        End Using

        ' Keep the console window open until a key is pressed
        Console.ReadKey()
    End Sub

End Module