Find Text in PDF | Powershell

ByteScout PDF Extractor SDK

Free Trial Web API version Licensing Request A Quote

HAVE QUESTIONS OR NEED HELP? SUBMIT THE SUPPORT REQUEST FORM or send an email to SUPPORT@BYTESCOUT.COM

FindText.bat:

@echo off
rem ---------------------------------------------------------------------------
rem FindText.bat - runs FindText.ps1 against the PDF given as the first argument.
rem
rem Usage:   FindText.bat file_name
rem Example: FindText.bat "sample1.pdf"
rem
rem Set NOPAUSE=1 to suppress every "press any key" prompt (unattended runs).
rem The exit code of the PowerShell script is returned to the caller.
rem ---------------------------------------------------------------------------
setlocal

if "%~1"=="" (
	echo -----------------------------------------------------
	echo Invalid parameter!
	echo -----------------------------------------------------
	echo Usage: FindText.bat file_name
	echo Example: FindText.bat "sample1.pdf"
	echo -----------------------------------------------------
	if not "%NOPAUSE%"=="1" pause
	exit /b 1
)

rem %~dp0 is this batch file's own folder, so the script is found no matter
rem which directory the caller is in.
rem -File hands the argument to the script verbatim; -Command would re-parse the
rem joined string and split a path that contains spaces.
rem %~1 strips any quotes the caller supplied so they are not doubled here.
powershell -NoProfile -ExecutionPolicy Bypass -File "%~dp0FindText.ps1" "%~1"

rem Capture the result immediately: echo/pause below would otherwise overwrite it.
set "RESULT=%errorlevel%"
echo Script finished with errorlevel=%RESULT%

if not "%NOPAUSE%"=="1" pause
exit /b %RESULT%

FindText.ps1:

Copy

#*******************************************************************************************#
#                                                                                           #
# Download Free Evaluation Version From: https://bytescout.com/download/web-installer       #
#                                                                                           #
# Also available as Web API! Get Your Free API Key: https://app.pdf.co/signup               #
#                                                                                           #
# Copyright © 2017-2020 ByteScout, Inc. All rights reserved.                                #
# https://www.bytescout.com                                                                 #
# https://pdf.co                                                                            #
#                                                                                           #
#*******************************************************************************************#

# Note: if you are looking for a higher-level API to extract data from invoices, reports, or statements,
# then please check Document Parser SDK and Web API at https://bytescout.com/products/developer/documentparsersdk/index.html
# and https://pdf.co for a secure and scalable Web API

# Searches every page of a PDF for a text string and prints the position and
# font details of each match, using Bytescout.PDFExtractor.TextExtractor.
#
# Parameters:
#   -InputFileName  Path to the PDF file to search (required).
#   -SearchText     Text to look for; defaults to "ipsum".
#
# Exit code: 0 on success, 1 if the input file is missing or any step fails.
Param (
    [Parameter(Mandatory = $true)]
    [string] $InputFileName,

    [string] $SearchText = "ipsum"
)

# Check input file exists. -LiteralPath stops characters such as [ ] being
# treated as wildcards; -PathType Leaf rejects directories.
if (-not (Test-Path -LiteralPath $InputFileName -PathType Leaf)) {

    Write-Host "Input file does not exist." -ForegroundColor Red
    Exit 1
}

# .NET methods resolve relative paths against the process working directory,
# which can differ from the PowerShell location, so pass an absolute path.
$InputFilePath = Convert-Path -LiteralPath $InputFileName

$Extractor = $null
$ExitCode = 0

try {
    # Add reference to Bytescout.PDFExtractor.dll assembly.
    # -ErrorAction Stop routes a load failure to the catch block below.
    Add-Type -Path "C:\Program Files\Bytescout PDF Extractor SDK\net4.00\Bytescout.PDFExtractor.dll" -ErrorAction Stop

    # Create and activate Bytescout.PDFExtractor.TextExtractor instance
    $Extractor = New-Object Bytescout.PDFExtractor.TextExtractor
    $Extractor.RegistrationName = "demo"
    $Extractor.RegistrationKey = "demo"

    # Load the PDF document
    $Extractor.LoadDocumentFromFile($InputFilePath)

    # Get page count
    $PageCount = $Extractor.GetPageCount()
    Write-Host "Pages: $PageCount"

    # Set the matching mode
    # 0 = WordMatchingMode.None - treats the search string as substring
    # 1 = WordMatchingMode.ExactMatch - treats the search string as separate word
    # 2 = WordMatchingMode.SmartMatch - will find the word in various forms (like Adobe Reader).
    $Extractor.WordMatchingMode = 1

    # Iterate through pages (the loop index is zero-based)
    for ($i = 0; $i -lt $PageCount; $i++) {

        # Search the current page for the requested string
        if ($Extractor.Find($i, $SearchText, $false) -eq $true) {

            do {

                # Report a one-based page number to the user
                $PageNo = $i + 1
                Write-Host "Found on page $PageNo"
                # Iterate through each element in the found text
                foreach ($Element in $Extractor.FoundText.Elements) {

                    Write-Host "Element " $Element.Index
                    Write-Host "Location: " $Element.Left ", " $Element.Top ", " $Element.Width ", " $Element.Height 
                    Write-Host "Text: " $Element.Text
                    Write-Host "Font is bold: " $Element.FontIsBold
                    Write-Host "Font is italic: " $Element.FontIsItalic
                    Write-Host "Font name: " $Element.FontName
                    Write-Host "Font size: " $Element.FontSize
                    Write-Host "Font color: " $Element.FontColor
                }
            } while ($Extractor.FindNext() -eq $true)
        } else {

            Write-Host "Nothing found"
        }
    }
} catch {

    # Report the failure and make sure the caller sees a non-zero exit code
    Write-Host $_.Exception.Message
    $ExitCode = 1
} finally {

    # Release the extractor on every path; it is still $null if creation failed
    if ($null -ne $Extractor) {
        $Extractor.Dispose()
    }
}

Exit $ExitCode