Parse with OCR | C# | ByteScout Document Parser SDK

Parse with OCR | C#

Program.cs:

C#
using System;
using ByteScout.DocumentParser;

// This example demonstrates parsing of scanned documents
// using the Optical Character Recognition (OCR).

namespace GeneralExample
{
    /// <summary>
    /// Console sample: parses a scanned document image against a YAML template
    /// with OCR enabled and prints the extracted data as JSON.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Configures OCR on a <see cref="DocumentParser"/>, loads the template,
        /// parses the input document and writes the JSON result to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Both paths are relative to the current working directory.
            string template = @".\DigitalOcean.yml";
            string inputDocument = @".\DigitalOcean-scanned.jpg";

            // Create and activate DocumentParser instance
            using (DocumentParser documentParser = new DocumentParser("demo", "demo"))
            {
                // Enable Optical Character Recognition (OCR) in Auto mode
                // (DocumentParser automatically detects if OCR is required).
                documentParser.OCRMode = OCRMode.Auto;

                // Set PDF document rendering resolution
                documentParser.OCRResolution = 300;

                // Set the location of OCR language data files
                documentParser.OCRLanguageDataFolder = @"c:\Program Files\ByteScout Document Parser SDK\ocrdata";

                // Set OCR language
                // "eng" for english, "deu" for German, "fra" for French, etc. - according to files in "ocrdata" folder
                documentParser.OCRLanguage = "eng";
                // Find more language files at https://github.com/bytescout/ocrdata

                // Note: The OCRLanguage can be overridden in a template.
                // See the Template Creation Guide.

                // You can also apply various preprocessing filters
                // to improve the recognition on low-quality scans.
                // The calls below are intentionally left commented out as opt-in examples.

                // Automatically deskew skewed scans
                //documentParser.OCRImagePreprocessingFilters.AddDeskew();

                // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
                //documentParser.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
                //documentParser.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

                // Repair broken letters
                //documentParser.OCRImagePreprocessingFilters.AddDilate();

                // Remove noise
                //documentParser.OCRImagePreprocessingFilters.AddMedian();

                // Apply Gamma Correction
                //documentParser.OCRImagePreprocessingFilters.AddGammaCorrection(1.4);

                // Add Contrast
                //documentParser.OCRImagePreprocessingFilters.AddContrast(20);

                // Load template
                documentParser.AddTemplate(template);

                Console.WriteLine("Template loaded.");
                Console.WriteLine();

                Console.WriteLine($"Parsing \"{inputDocument}\" with OCR...");
                Console.WriteLine();

                // Parse document data to JSON format
                string jsonString = documentParser.ParseDocument(inputDocument, OutputFormat.JSON);

                // Display parsed data in console
                Console.WriteLine("Parsed data in JSON format:");
                Console.WriteLine();
                Console.WriteLine(jsonString);
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");

            // The prompt promises "any key", so wait for a single key press rather
            // than a full line. Console.ReadKey throws InvalidOperationException when
            // standard input is redirected (piped/automated runs), so keep the
            // line-based wait in that case.
            if (Console.IsInputRedirected)
            {
                Console.ReadLine();
            }
            else
            {
                Console.ReadKey(intercept: true);
            }
        }
    }
}

DigitalOcean.yml:

# Template describing what to extract from the DigitalOcean invoice sample.
# Tokens written as {{Name}} inside expressions are template macros
# (see the fields with "fieldType: macros" below).
templateName: DigitalOcean Invoice
templateVersion: 4
templatePriority: 0
# Keywords associated with this template (presumably used to decide whether a
# document matches it - confirm against the Template Creation Guide).
detectionRules:
  keywords:
  - DigitalOcean
  - 101 Avenue of the Americas
  - Invoice Number
# Items to extract: five single-value fields followed by one table.
objects:
# Fixed expression, no macros.
- name: companyName
  objectType: field
  fieldProperties:
    fieldType: static
    expression: DigitalOcean
    regex: true
# The parenthesized group captures the digits following "Invoice Number: ".
- name: invoiceId
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Invoice Number: ({{Digits}})'
    regex: true
# The parenthesized group captures the date following "Date Issued: ".
# dateFormat "auto-mdy" presumably means month-day-year ordering - confirm in the SDK docs.
- name: dateIssued
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Date Issued: ({{SmartDate}})'
    regex: true
    dataType: date
    dateFormat: auto-mdy
# The parenthesized group captures the amount following "Total: "; typed as decimal.
- name: total
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Total: ({{Money}})'
    regex: true
    dataType: decimal
# Fixed expression, no macros.
- name: currency
  objectType: field
  fieldProperties:
    fieldType: static
    expression: USD
    regex: true
# Table bounded by a "Description ... Hours" start expression and a "Total:" end expression.
- name: table1
  objectType: table
  tableProperties:
    start:
      expression: Description{{Spaces}}Hours
      regex: true
    end:
      expression: 'Total:'
      regex: true
    # Row pattern with named groups, in order: description, hours, start, end, unitPrice.
    # "start" and "end" share the same shape: 2 digits, minus, 2 digits, space,
    # 2 digits, colon, 2 digits. unitPrice is the number after a dollar sign.
    row:
      expression: '{{LineStart}}{{Spaces}}(?<description>{{SentenceWithSingleSpaces}}){{Spaces}}(?<hours>{{Digits}}){{Spaces}}(?<start>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}(?<end>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}{{Dollar}}(?<unitPrice>{{Number}})'
      regex: true
    # Only hours and unitPrice declare a dataType here; description, start and
    # end have no explicit type in this template.
    columns:
    - name: hours
      dataType: integer
    - name: unitPrice
      dataType: decimal