Parse with OCR | VB.NET | ByteScout Document Parser SDK

Parse with OCR | VB.NET

Module1.vb:

VB
Imports ByteScout.DocumentParser

' This example demonstrates parsing of scanned documents
' using the Optical Character Recognition (OCR).

''' <summary>
''' Console sample: parses a scanned invoice image with ByteScout Document Parser,
''' using OCR and a YAML template, and prints the extracted data as JSON.
''' </summary>
Module Module1

    ''' <summary>
    ''' Configures OCR on a DocumentParser instance, loads the template,
    ''' parses the input image and writes the resulting JSON to the console.
    ''' </summary>
    Sub Main()

        ' Both paths are relative to the current working directory.
        Dim template As String = ".\DigitalOcean.yml"
        Dim inputDocument As String = ".\DigitalOcean-scanned.jpg"

        ' Create and activate DocumentParser instance.
        ' NOTE(review): the two "demo" arguments are presumably the registration
        ' name and key - replace them with your own license values.
        Using documentParser As New DocumentParser("demo", "demo")

            ' Enable Optical Character Recognition (OCR) in Auto mode
            ' (DocumentParser automatically detects if OCR is required).
            documentParser.OCRMode = OCRMode.Auto

            ' Set PDF document rendering resolution
            documentParser.OCRResolution = 300

            ' Set the location of OCR language data files.
            ' This is the SDK's default installation path; adjust it if the SDK
            ' is installed elsewhere.
            documentParser.OCRLanguageDataFolder = "c:\Program Files\ByteScout Document Parser SDK\ocrdata"

            ' Set OCR language
            ' "eng" for English, "deu" for German, "fra" for French, etc. - according to files in "ocrdata" folder
            documentParser.OCRLanguage = "eng"
            ' Find more language files at https://github.com/bytescout/ocrdata

            ' Note: The OCRLanguage can be overridden in a template.
            ' See the Template Creation Guide.


            ' You can also apply various preprocessing filters
            ' to improve the recognition on low-quality scans.
            ' Uncomment the ones you need.

            ' Automatically deskew skewed scans
            'documentParser.OCRImagePreprocessingFilters.AddDeskew()

            ' Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
            'documentParser.OCRImagePreprocessingFilters.AddVerticalLinesRemover()
            'documentParser.OCRImagePreprocessingFilters.AddHorizontalLinesRemover()

            ' Repair broken letters
            'documentParser.OCRImagePreprocessingFilters.AddDilate()

            ' Remove noise
            'documentParser.OCRImagePreprocessingFilters.AddMedian()

            ' Apply Gamma Correction
            'documentParser.OCRImagePreprocessingFilters.AddGammaCorrection(1.4)

            ' Add Contrast
            'documentParser.OCRImagePreprocessingFilters.AddContrast(20)

            ' Load template
            documentParser.AddTemplate(template)

            Console.WriteLine("Template loaded.")
            Console.WriteLine()

            Console.WriteLine($"Parsing ""{inputDocument}"" with OCR...")
            Console.WriteLine()


            ' Parse document data to JSON format
            Dim jsonString As String = documentParser.ParseDocument(inputDocument, OutputFormat.JSON)

            ' Display parsed data in console
            Console.WriteLine("Parsed data in JSON format:")
            Console.WriteLine()
            Console.WriteLine(jsonString)

        End Using

        ' Keep the console window open until the user confirms.
        ' Console.ReadLine() only returns once Enter is pressed, so the prompt
        ' asks for Enter rather than "any key".
        Console.WriteLine()
        Console.WriteLine("Press Enter to continue...")
        Console.ReadLine()

    End Sub

End Module

DigitalOcean.yml:

# Document Parser template for the DigitalOcean invoice sample.
# Module1.vb loads this file with documentParser.AddTemplate(".\DigitalOcean.yml").
# NOTE(review): the meaning of individual keys below is inferred from their names;
# confirm details against the ByteScout Template Creation Guide.
templateName: DigitalOcean Invoice
templateVersion: 4
templatePriority: 0
# Keywords associated with this template - presumably used by the parser to decide
# whether a document matches the template (TODO confirm matching rules).
detectionRules:
  keywords:
  - DigitalOcean
  - 101 Avenue of the Americas
  - Invoice Number
# Items to extract. Each entry has a name, an objectType (field or table)
# and a matching properties section.
objects:
# Static field: the expression is the literal company name.
- name: companyName
  objectType: field
  fieldProperties:
    fieldType: static
    expression: DigitalOcean
    regex: true
# Macro field: {{Digits}} is a template macro; the parenthesized group
# presumably marks the value to capture (confirm in the guide).
- name: invoiceId
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Invoice Number: ({{Digits}})'
    regex: true
# Issue date, declared as a date with the auto-mdy date format.
- name: dateIssued
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Date Issued: ({{SmartDate}})'
    regex: true
    dataType: date
    dateFormat: auto-mdy
# Invoice total, declared as a decimal.
- name: total
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Total: ({{Money}})'
    regex: true
    dataType: decimal
# Static field: the currency is fixed to USD rather than read from the document.
- name: currency
  objectType: field
  fieldProperties:
    fieldType: static
    expression: USD
    regex: true
# Line-item table, delimited by a start expression (the header text
# "Description ... Hours") and an end expression ("Total:").
- name: table1
  objectType: table
  tableProperties:
    start:
      expression: Description{{Spaces}}Hours
      regex: true
    end:
      expression: 'Total:'
      regex: true
    # Row pattern with named groups: description, hours, start, end, unitPrice.
    # The start and end groups share the same shape: two 2-digit parts joined by
    # a minus, a space, then two 2-digit parts joined by a colon.
    row:
      expression: '{{LineStart}}{{Spaces}}(?<description>{{SentenceWithSingleSpaces}}){{Spaces}}(?<hours>{{Digits}}){{Spaces}}(?<start>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}(?<end>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}{{Dollar}}(?<unitPrice>{{Number}})'
      regex: true
    # Explicit data types are given only for hours and unitPrice; the other
    # named groups have no dataType entry here.
    columns:
    - name: hours
      dataType: integer
    - name: unitPrice
      dataType: decimal