Free Trial
Web API version
Licensing
Request A Quote
HAVE QUESTIONS OR NEED HELP? SUBMIT THE SUPPORT REQUEST FORM or write an email to SUPPORT@BYTESCOUT.COM
Parse with OCR | VB.NET
Module1.vb:
VB
Imports ByteScout.DocumentParser

' Sample program: parses a scanned document image against a YAML template,
' relying on Optical Character Recognition (OCR) to read the text.
Module Module1

    ' Entry point. Loads the template, parses the scanned image to JSON,
    ' prints the result to the console and waits for input before exiting.
    Sub Main()

        Dim templatePath As String = ".\DigitalOcean.yml"
        Dim scannedInvoice As String = ".\DigitalOcean-scanned.jpg"

        ' Create and activate the parser; the Using block disposes it afterwards.
        Using parser As New DocumentParser("demo", "demo")

            ConfigureOcr(parser)

            ' Optional preprocessing filters that may improve recognition
            ' on low-quality scans. Uncomment the ones you need.

            ' Straighten skewed scans automatically
            'parser.OCRImagePreprocessingFilters.AddDeskew()

            ' Strip vertical or horizontal lines (can help the OCR engine
            ' avoid page segmentation errors)
            'parser.OCRImagePreprocessingFilters.AddVerticalLinesRemover()
            'parser.OCRImagePreprocessingFilters.AddHorizontalLinesRemover()

            ' Repair broken letters
            'parser.OCRImagePreprocessingFilters.AddDilate()

            ' Remove noise
            'parser.OCRImagePreprocessingFilters.AddMedian()

            ' Apply gamma correction
            'parser.OCRImagePreprocessingFilters.AddGammaCorrection(1.4)

            ' Add contrast
            'parser.OCRImagePreprocessingFilters.AddContrast(20)

            ' Register the template used for parsing
            parser.AddTemplate(templatePath)
            Console.WriteLine("Template loaded.")
            Console.WriteLine()

            Console.WriteLine($"Parsing ""{scannedInvoice}"" with OCR...")
            Console.WriteLine()

            ' Run the parser and request the result as JSON text
            Dim parsedJson As String = parser.ParseDocument(scannedInvoice, OutputFormat.JSON)

            ' Show the parsed data
            Console.WriteLine("Parsed data in JSON format:")
            Console.WriteLine()
            Console.WriteLine(parsedJson)

        End Using

        Console.WriteLine()
        Console.WriteLine("Press any key to continue...")
        Console.ReadLine()

    End Sub

    ' Applies the OCR-related settings used by this sample to the given parser.
    Private Sub ConfigureOcr(parser As DocumentParser)

        ' Auto mode: the SDK decides by itself whether OCR is required.
        parser.OCRMode = OCRMode.Auto

        ' Rendering resolution used for PDF documents
        parser.OCRResolution = 300

        ' Folder that holds the OCR language data files
        parser.OCRLanguageDataFolder = "c:\Program Files\ByteScout Document Parser SDK\ocrdata"

        ' OCR language: "eng" for English, "deu" for German, "fra" for French, etc.,
        ' matching the files present in the "ocrdata" folder.
        ' More language files: https://github.com/bytescout/ocrdata
        ' Note: a template can override OCRLanguage (see the Template Creation Guide).
        parser.OCRLanguage = "eng"

    End Sub

End Module
DigitalOcean.yml:
# Document Parser template for DigitalOcean invoices.
# NOTE(review): nesting was reconstructed from a flattened copy of this file;
# confirm against the SDK's Template Creation Guide before relying on it.
templateName: DigitalOcean Invoice
templateVersion: 4
templatePriority: 0

# Keywords listed for template detection.
detectionRules:
  keywords:
  - DigitalOcean
  - 101 Avenue of the Americas
  - Invoice Number

# Fields and tables to extract.
objects:
# Constant value (fieldType: static).
- name: companyName
  objectType: field
  fieldProperties:
    fieldType: static
    expression: DigitalOcean
    regex: true
# The parenthesised group is presumably the captured value; {{...}} are template macros.
- name: invoiceId
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Invoice Number: ({{Digits}})'
    regex: true
- name: dateIssued
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Date Issued: ({{SmartDate}})'
    regex: true
    dataType: date
    dateFormat: auto-mdy
- name: total
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Total: ({{Money}})'
    regex: true
    dataType: decimal
# Constant value (fieldType: static).
- name: currency
  objectType: field
  fieldProperties:
    fieldType: static
    expression: USD
    regex: true
# Line-item table delimited by the start and end expressions.
- name: table1
  objectType: table
  tableProperties:
    start:
      expression: Description{{Spaces}}Hours
      regex: true
    end:
      expression: 'Total:'
      regex: true
    # Named groups in the row expression: description, hours, start, end, unitPrice.
    row:
      expression: '{{LineStart}}{{Spaces}}(?<description>{{SentenceWithSingleSpaces}}){{Spaces}}(?<hours>{{Digits}}){{Spaces}}(?<start>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}(?<end>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}{{Dollar}}(?<unitPrice>{{Number}})'
      regex: true
    # Data types declared for the hours and unitPrice columns.
    columns:
    - name: hours
      dataType: integer
    - name: unitPrice
      dataType: decimal