Extract Data from a PDF

The code below shows you how to extract data from PDF files.

Data can be extracted as text (comma-, semicolon- or tab-delimited), as unformatted Excel, or as MySQL- or MS-SQL-compatible statements.

This sample requires a Professional license.

C#
using SolidFramework.Converters;
using SolidFramework.Converters.Plumbing;
using System;
using System.IO;

namespace PDFtoData
{
    /// <summary>
    /// Console sample that runs a single PDF through <see cref="PdfToDataConverter"/>
    /// and writes the result next to the source file with an .xlsx extension.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Imports the license, converts the sample PDF and waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments; not used by this sample.</param>
        [STAThread]
        static void Main(string[] args)
        {
            // Import your Solid Documents license before creating any converter.
            SolidFramework.License.Import(@"C:\MyFolder\license.xml");

            // The PDF you want to convert.
            string sourcePdf = @"C:\YourFolder\yourpdf.pdf";

            // Same folder and file name as the source, with the extension of the file being created.
            string destination = Path.ChangeExtension(sourcePdf, ".xlsx");

            using (PdfToDataConverter dataConverter = new PdfToDataConverter())
            {
                ConversionStatus outcome = ExtractData(dataConverter, sourcePdf, destination);

                // Only a status other than Success is written to the console window.
                if (outcome != ConversionStatus.Success)
                {
                    Console.WriteLine(outcome);
                }
            }

            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        }

        /// <summary>
        /// Configures <paramref name="dataConverter"/> for unformatted Excel output with
        /// Solid OCR text recovery and converts <paramref name="sourcePdf"/>.
        /// </summary>
        /// <param name="dataConverter">Converter to configure and run; the caller disposes it.</param>
        /// <param name="sourcePdf">Path of the PDF to read.</param>
        /// <param name="destination">Path of the file to create.</param>
        /// <returns>The status returned by the converter's <c>ConvertTo</c> call.</returns>
        private static ConversionStatus ExtractData(PdfToDataConverter dataConverter, string sourcePdf, string destination)
        {
            // Queue the file to convert.
            dataConverter.AddSourceFile(sourcePdf);

            // Choose the export format. Alternatives to the Excel setting used here:
            //
            //   CSV:
            //     dataConverter.ExportFormat = SolidFramework.Converters.Plumbing.DataExportFormat.Text;
            //     dataConverter.DelimiterOption = DelimiterOptions.Comma;
            //   MySQL:
            //     dataConverter.ExportFormat = SolidFramework.Converters.Plumbing.DataExportFormat.MySQL;
            //   Microsoft SQL:
            //     dataConverter.ExportFormat = SolidFramework.Converters.Plumbing.DataExportFormat.MSSQL;
            //
            // Simple unformatted Excel:
            dataConverter.ExportFormat = SolidFramework.Converters.Plumbing.DataExportFormat.Excel;

            // Limit the conversion to a specific page range (here built from the single value 1).
            dataConverter.PageRange = new SolidFramework.PageRange(new int[] { 1 });

            // Solid Documents OCR for scanned files. Only with Pro+OCR and OCR license.
            dataConverter.TextRecoveryEngine = TextRecoveryEngine.SolidOCR;
            dataConverter.TextRecoveryType = TextRecovery.Automatic;

            // NOTE(review): the second argument presumably allows overwriting an existing
            // output file - confirm against the ConvertTo API reference.
            return dataConverter.ConvertTo(destination, true);
        }
    }
}

C++
#include "stdafx.h"
#include "SolidFramework.h"

#include <iostream>
using  namespace std;

// Progress callback invoked through PdfToDataConverter::FireProgress.
// This sample does not report progress, so the event is ignored.
void DoProgress(SolidFramework::ProgressEventArgsPtr args)
{
    (void)args; // intentionally unused
}

// Warning callback invoked through PdfToDataConverter::FireWarning.
// This sample does not surface warnings, so the event is ignored.
void DoWarning(SolidFramework::WarningEventArgsPtr args)
{
    (void)args; // intentionally unused
}

// Concrete converter for this sample: derives from PdfToDataConverterBase and
// forwards its progress and warning events to the free functions DoProgress
// and DoWarning.
class PdfToDataConverter : public SolidFramework::Converters::PdfToDataConverterBase
{
public:
    // Hands the progress event to DoProgress.
    void FireProgress(SolidFramework::ProgressEventArgsPtr args) override
    {
        DoProgress(args);
    }

    // Hands the warning event to DoWarning.
    void FireWarning(SolidFramework::WarningEventArgsPtr args) override
    {
        DoWarning(args);
    }
};

int _tmain(int argc, _TCHAR* argv[])
{

    /*Enter your Solid Documents License Information Here*/
    SolidFramework::License::Import(L"C:\\MyFolder\\license.xml");

    PdfToDataConverter *converter = new PdfToDataConverter();

    //converter->set
    SolidFramework::Converters::CustomData *pData = NULL;
    pData = new SolidFramework::Converters::CustomData();
    pData->Converter = converter;
    pData->Data = nullptr;
    converter->setCustomData(pData);

    // Add files to convert and directory to output converted file
    converter->AddSourceFile(L"C:\\YourFolder\\yourpdf.pdf");
    converter->setOutputDirectory(L"C:\\MyFolder");

    // Set the options - in this case minimal Excel
    //converter->setExportFormat(SolidFramework::Converters::Plumbing::DataExportFormat::Excel);

    converter->setExportFormat(SolidFramework::Converters::Plumbing::DataExportFormat::MSSQL);
    // Convert the file
    cout << "Starting conversion." << endl; converter->Convert();

    SolidFramework::Converters::Plumbing::ConversionStatus status = converter->getResults()->getItem(0)->getStatus();
    if (status != SolidFramework::Converters::Plumbing::ConversionStatus::Success)
    {
        cout << "Conversion failed." << endl;
    }
    else
    {
        cout << "Conversion succeeded." << endl; } converter->Dispose();

    cout << "Press <Enter> to exit." << endl;
    cin.get();

    return 0;
}

VB.Net
Imports SolidFramework.Plumbing
Imports SolidFramework.Pdf
Imports SolidFramework.Converters
Imports System.IO
Imports SolidFramework.Converters.Plumbing

Module PDFtoData

    ''' <summary>
    ''' Imports the license, converts the sample PDF to a file with an .xlsx
    ''' extension next to the source, and waits for a key press.
    ''' </summary>
    Sub Main()
        ' Import your Solid Documents license before creating any converter.
        SolidFramework.License.Import("C:\Useful\license.xml")

        ' The PDF you want to convert.
        Dim sourcePdf As String = "C:\YourFolder\yourpdf.pdf"

        ' Same folder and file name as the source, with the extension of the file being created.
        Dim destination As String = Path.ChangeExtension(sourcePdf, ".xlsx")

        Using dataConverter As New PdfToDataConverter()
            Dim outcome As ConversionStatus = ExtractData(dataConverter, sourcePdf, destination)

            ' Only a status other than Success is written to the console window.
            If outcome <> ConversionStatus.Success Then
                Console.WriteLine(outcome)
            End If
        End Using

        ' Keep the console window open until a key is pressed.
        Console.ReadKey()
    End Sub

    ''' <summary>
    ''' Configures the converter for unformatted Excel output with Solid OCR
    ''' text recovery and converts the given PDF.
    ''' </summary>
    ''' <param name="dataConverter">Converter to configure and run; the caller disposes it.</param>
    ''' <param name="sourcePdf">Path of the PDF to read.</param>
    ''' <param name="destination">Path of the file to create.</param>
    ''' <returns>The status returned by the converter's ConvertTo call.</returns>
    Private Function ExtractData(ByVal dataConverter As PdfToDataConverter, ByVal sourcePdf As String, ByVal destination As String) As ConversionStatus
        ' Queue the file to convert.
        dataConverter.AddSourceFile(sourcePdf)

        ' Choose the export format. Alternatives to the Excel setting used here:
        '
        '   CSV:
        '     dataConverter.ExportFormat = SolidFramework.Converters.Plumbing.DataExportFormat.Text
        '     dataConverter.DelimiterOption = DelimiterOptions.Comma
        '   MySQL:
        '     dataConverter.ExportFormat = SolidFramework.Converters.Plumbing.DataExportFormat.MySQL
        '   Microsoft SQL:
        '     dataConverter.ExportFormat = SolidFramework.Converters.Plumbing.DataExportFormat.MSSQL
        '
        ' Simple unformatted Excel:
        dataConverter.ExportFormat = SolidFramework.Converters.Plumbing.DataExportFormat.Excel

        ' Limit the conversion to a specific page range (here built from the single value 1).
        dataConverter.PageRange = New SolidFramework.PageRange(New Integer() {1})

        ' Solid Documents OCR for scanned files. Only with Pro+OCR and OCR license.
        dataConverter.TextRecoveryEngine = TextRecoveryEngine.SolidOCR
        dataConverter.TextRecoveryType = TextRecovery.Automatic

        ' NOTE(review): the second argument presumably allows overwriting an existing
        ' output file - confirm against the ConvertTo API reference.
        Return dataConverter.ConvertTo(destination, True)
    End Function

End Module