Извлечение текста из PDF-документов с использованием PDFium VCL в Delphi.

Извлечение текста — одна из самых распространенных задач обработки PDF-файлов. Независимо от того, создаете ли вы поисковую систему документов, приложение для анализа данных или систему управления контентом, возможность извлечения текста из PDF-файлов является необходимой. В этом руководстве рассматривается демонстрационный пример «Извлечение текста», который показывает, как извлекать текстовое содержимое из PDF-документов с помощью PDFium VCL.

Обзор

Демонстрация "Извлечение текста" показывает, как извлечь все текстовое содержимое из PDF-документа и сохранить его в текстовый файл. Она поддерживает выбор диапазона страниц, сохранение абзацев и корректную обработку специальных символов.

Основные функции

Полное извлечение документа – Извлечение текста со всех страниц одновременно
Выбор диапазона страниц – Извлечение текста только с определенных страниц
Обнаружение абзацев – Сохранение структуры абзацев на основе позиций символов
Обработка специальных символов – Возможность удаления символов NUL из выходных данных
Разделители страниц – Необязательные пустые строки между страницами
Отслеживание прогресса – Визуальная индикация прогресса и подробное ведение журнала
Вывод в кодировке UTF-8 – Правильная кодировка текстового вывода для международных документов
Доступ на уровне символов – Доступ к отдельным символам для расширенной обработки.

Требования к библиотеке PDFium DLL

Перед запуском любого приложения PDFium VCL убедитесь, что файлы PDFium DLL установлены:

pdfium32.dll / pdfium64.dll – Стандартные версии (около 5-6 МБ)
pdfium32v8.dll / pdfium64v8.dll – С JavaScript-движком V8 (около 23-27 МБ)

Установка: запустите PDFiumVCL\DLLs\CopyDlls.bat от имени администратора, чтобы автоматически скопировать DLL-файлы в системные каталоги Windows.

Базовое извлечение текста

Самый простой способ извлечения текста со страницы PDF:

// Opens 'document.pdf', reads the text of page 1 through the Text property
// and shows it in Memo1.
// NOTE(review): Memo1 is not declared in this snippet - presumably a memo on
// the demo form that is reachable from here; confirm against the project.
procedure ExtractSimpleText;
var
  Pdf: TPdf;
  PageText: string;
begin
  Pdf := TPdf.Create(nil);
  try
    Pdf.FileName := 'document.pdf';
    Pdf.Active := True;

    // Extract text from page 1
    Pdf.PageNumber := 1;
    PageText := Pdf.Text;

    // Use the extracted text
    Memo1.Lines.Text := PageText;
  finally
    // Close the document before releasing the component, even on error.
    Pdf.Active := False;
    Pdf.Free;
  end;
end;

Извлечение текста со всех страниц

Перебор всех страниц для извлечения полного текста документа:

// Extracts the text of the selected page range from the PDF named in
// EditPdfFile and writes it, UTF-8 encoded, to the file named in
// EditOutputFile.
//
// The range is either the whole document (RadioButtonAllPages checked) or the
// pages typed into EditFromPage / EditToPage. When CheckBoxPageSeparator is
// checked, three CR/LF pairs are written between consecutive pages.
//
// Raises ERangeError when the requested range does not lie inside
// 1..Pdf.PageCount; StrToInt raises EConvertError for non-numeric input.
procedure TFormMain.ButtonExtractClick(Sender: TObject);
var
  I, StartPage, EndPage: Integer;
  PageText: string;
  FileStream: TFileStream;
  Utf8Text, Separator: UTF8String;
begin
  Pdf.FileName := EditPdfFile.Text;
  Pdf.PageNumber := 0;
  Pdf.Active := True;
  try
    // Determine page range
    if RadioButtonAllPages.Checked then
    begin
      StartPage := 1;
      EndPage := Pdf.PageCount;
    end
    else
    begin
      StartPage := StrToInt(EditFromPage.Text);
      EndPage := StrToInt(EditToPage.Text);
    end;

    // Validate before the output file is created, so that a bad range does
    // not truncate an existing file.
    if (StartPage < 1) or (EndPage > Pdf.PageCount) or (StartPage > EndPage) then
      raise ERangeError.CreateFmt(
        'Invalid page range %d-%d (document has %d pages)',
        [StartPage, EndPage, Pdf.PageCount]);

    // Scale the progress bar to the number of pages being processed;
    // Position is set to 1..page count inside the loop.
    // NOTE(review): assumes ProgressBar is a standard TProgressBar with Min = 0.
    ProgressBar.Max := EndPage - StartPage + 1;
    ProgressBar.Position := 0;

    // The separator never changes, so encode it once.
    Separator := UTF8Encode(#13#10#13#10#13#10);

    // Create output file
    FileStream := TFileStream.Create(EditOutputFile.Text, fmCreate);
    try
      for I := StartPage to EndPage do
      begin
        Pdf.PageNumber := I;
        PageText := Pdf.Text;

        // Convert to UTF-8 and write; indexing [1] of an empty string is
        // invalid, hence the length check.
        Utf8Text := UTF8Encode(PageText);
        if Length(Utf8Text) > 0 then
          FileStream.WriteBuffer(Utf8Text[1], Length(Utf8Text));

        // Add page separator if enabled (not after the last page)
        if CheckBoxPageSeparator.Checked and (I < EndPage) then
          FileStream.WriteBuffer(Separator[1], Length(Separator));

        ProgressBar.Position := I - StartPage + 1;
        // Keep the UI responsive during long extractions.
        Application.ProcessMessages;
      end;
    finally
      FileStream.Free;
    end;
  finally
    Pdf.Active := False;
  end;
end;

Извлечение текста с сохранением структуры абзацев.

Для документов, где важна структура абзацев, используйте анализ положения символов:

// Returns the text of the current page of Pdf with line and paragraph breaks
// reconstructed from the vertical position of each character.
//
// Pass 1 estimates the line height as the smallest non-zero difference in Y
// between consecutive characters. Pass 2 copies the characters (skipping NUL),
// starts a new line when the Y gap exceeds 1.2 x line height and adds an extra
// empty line when it exceeds 2.5 x line height.
//
// Pdf must already be active and positioned on the page to process.
function ExtractTextWithParagraphs(Pdf: TPdf): string;
const
  DefaultLineHeight = 12;   // used when the page shows no vertical gap at all
  LineBreakFactor = 1.2;
  ParagraphBreakFactor = 2.5;
var
  CharIndex: Integer;
  CurrentChar: WideChar;
  CurrentY, PrevY: Double;
  LineHeight, YGap: Double;
  ResultText, LineBuffer: string;
  MinLineHeight: Double;
  HasPrev, GapFound: Boolean;
begin
  ResultText := '';
  LineBuffer := '';

  // First pass: determine typical line height.
  // A Boolean flag marks "no previous character yet" instead of a negative
  // sentinel, because a character's Y coordinate may itself be negative.
  HasPrev := False;
  GapFound := False;
  MinLineHeight := 0;
  PrevY := 0;
  for CharIndex := 0 to Pdf.CharacterCount - 1 do
  begin
    CurrentY := Pdf.CharacterOrigin[CharIndex].Y;
    if HasPrev then
    begin
      YGap := Abs(CurrentY - PrevY);
      if (YGap > 0) and ((not GapFound) or (YGap < MinLineHeight)) then
      begin
        MinLineHeight := YGap;
        GapFound := True;
      end;
    end;
    PrevY := CurrentY;
    HasPrev := True;
  end;

  if GapFound then
    LineHeight := MinLineHeight
  else
    LineHeight := DefaultLineHeight; // Default fallback

  // Second pass: build text with paragraph detection
  HasPrev := False;
  PrevY := 0;
  for CharIndex := 0 to Pdf.CharacterCount - 1 do
  begin
    CurrentChar := Pdf.Character[CharIndex];
    CurrentY := Pdf.CharacterOrigin[CharIndex].Y;

    // Skip NUL characters (they do not update PrevY either)
    if Ord(CurrentChar) = 0 then
      Continue;

    // Check for line break based on Y position change
    if HasPrev then
    begin
      YGap := Abs(CurrentY - PrevY);
      if YGap > LineHeight * LineBreakFactor then
      begin
        // Add current line to result
        if LineBuffer <> '' then
        begin
          ResultText := ResultText + LineBuffer + #13#10;
          LineBuffer := '';
        end;

        // Check if this is a paragraph break (larger gap)
        if YGap > LineHeight * ParagraphBreakFactor then
          ResultText := ResultText + #13#10; // Extra line for paragraph
      end;
    end;

    LineBuffer := LineBuffer + CurrentChar;
    PrevY := CurrentY;
    HasPrev := True;
  end;

  // Add final line
  if LineBuffer <> '' then
    ResultText := ResultText + LineBuffer;

  Result := ResultText;
end;

Очистка извлеченного текста.

Удалите символы NUL и нормализуйте текст:

// Returns RawText with every NUL (#0) character removed, encoded as UTF-8.
// All other characters are kept unchanged and in their original order.
function CleanAndFormatText(const RawText: string): UTF8String;
var
  SrcPos, Kept: Integer;
  Buffer: string;
begin
  // The output can never be longer than the input, so reserve that much
  // and trim to the number of characters actually kept.
  SetLength(Buffer, Length(RawText));
  Kept := 0;

  for SrcPos := 1 to Length(RawText) do
    if RawText[SrcPos] <> #0 then
    begin
      Inc(Kept);
      Buffer[Kept] := RawText[SrcPos];
    end;

  SetLength(Buffer, Kept);
  Result := UTF8Encode(Buffer);
end;

Извлечение текста из определенной области.

Извлеките текст из прямоугольной области страницы:

// Opens 'document.pdf', selects page 1 and shows the text found inside a
// fixed rectangle of that page in a message box.
procedure ExtractTextFromRegion;
var
  Pdf: TPdf;
  RegionText: string;
begin
  Pdf := TPdf.Create(nil);
  try
    Pdf.FileName := 'document.pdf';
    Pdf.Active := True;
    Pdf.PageNumber := 1;

    // Extract text from specific rectangle
    // Parameters: Left, Top, Right, Bottom (in PDF coordinates)
    RegionText := Pdf.TextInRectangle(100, 700, 500, 600);

    ShowMessage('Text in region: ' + RegionText);
  finally
    // Close the document before releasing the component, even on error.
    Pdf.Active := False;
    Pdf.Free;
  end;
end;

Доступ на уровне символов

Для точного анализа текста получайте доступ к отдельным символам:

// Opens 'document.pdf', walks over every character of page 1 and adds one
// line per character (the character, its origin and its font size) to Memo1.
// Characters flagged as generated or as having a Unicode mapping error are
// skipped.
// NOTE(review): Memo1 is not declared in this snippet - presumably a memo on
// the demo form that is reachable from here; confirm against the project.
procedure AnalyzeCharacters;
var
  Pdf: TPdf;
  I: Integer;
  // Named Ch / CharRect so the locals do not hide the built-in Char type and
  // the Rect function.
  Ch: WideChar;
  Origin: TPdfPoint;
  CharRect: TPdfRectangle;
  FontSize: Double;
begin
  Pdf := TPdf.Create(nil);
  try
    Pdf.FileName := 'document.pdf';
    Pdf.Active := True;
    Pdf.PageNumber := 1;

    // Access each character
    for I := 0 to Pdf.CharacterCount - 1 do
    begin
      // Check character properties first, so nothing is read for characters
      // that are going to be skipped anyway.
      if Pdf.CharacterGenerated[I] then
        // Character was generated (e.g., hyphenation)
        Continue;

      if Pdf.CharacterMapError[I] then
        // Character couldn't be mapped to Unicode
        Continue;

      Ch := Pdf.Character[I];
      Origin := Pdf.CharacterOrigin[I];
      // Bounding box of the character; read here to show the property, it is
      // not part of the logged line below.
      CharRect := Pdf.CharacterRectangle[I];
      FontSize := Pdf.FontSize[I];

      // Process character with position and size info
      Memo1.Lines.Add(Format('Char: %s at (%.2f, %.2f) size: %.2f',
        [Ch, Origin.X, Origin.Y, FontSize]));
    end;
  finally
    // Close the document before releasing the component, even on error.
    Pdf.Active := False;
    Pdf.Free;
  end;
end;

Поиск символа по координатам на экране

Полезно для выделения текста и взаимодействия:

// Looks up the index of the character located at point (X, Y) on the current
// page of Pdf, allowing a fixed tolerance of 5.0 in each direction.
// The result is whatever Pdf.CharacterIndexAtPos reports for that point.
function GetCharacterAtPosition(Pdf: TPdf; X, Y: Double): Integer;
const
  ToleranceX = 5.0;
  ToleranceY = 5.0;
begin
  Result := Pdf.CharacterIndexAtPos(X, Y, ToleranceX, ToleranceY);
end;

Обработка ошибок и граничных случаев

// Extracts text page by page with error handling at two levels:
//   * a failure to open the document is logged and aborts the procedure;
//   * a failure on a single page is logged and processing continues with the
//     next page.
// The document is always closed at the end.
procedure TFormMain.SafeExtractText;
var
  I, StartPage, EndPage: Integer;
  PageText: string;
begin
  try
    Pdf.FileName := EditPdfFile.Text;
    Pdf.PageNumber := 0;
    Pdf.Active := True;
  except
    on E: Exception do
    begin
      LogMessage('Failed to load PDF: ' + E.Message);
      Exit;
    end;
  end;

  try
    // NOTE(review): the original snippet used StartPage / EndPage without
    // declaring or initialising them. They are made local here and cover the
    // whole document; substitute the user-selected range if one is required.
    StartPage := 1;
    EndPage := Pdf.PageCount;

    for I := StartPage to EndPage do
    begin
      try
        Pdf.PageNumber := I;
        PageText := Pdf.Text;
        // Process text...
      except
        on E: Exception do
        begin
          // Log error but continue with next page
          LogMessage('Error on page ' + IntToStr(I) + ': ' + E.Message);
        end;
      end;
    end;
  finally
    Pdf.Active := False;
  end;
end;

Особенности производительности

Извлекайте текст страницу за страницей, а не загружайте все в память.
Используйте потоковую запись файлов для больших документов.
Вызывайте Application.ProcessMessages в циклах, чтобы пользовательский интерфейс оставался отзывчивым.
Рассмотрите возможность пакетной обработки для нескольких документов.

Заключение.

Демонстрация извлечения текста показывает, как PDFium VCL делает извлечение текста простым и надежным. Независимо от того, требуется ли вам простое извлечение текста или сложная обработка с учетом абзацев, компонент предоставляет все необходимые инструменты.

Доступ на уровне символов позволяет выполнять сложный анализ текста, в то время как простое свойство Text позволяет обрабатывать большинство распространенных сценариев использования всего одной строкой кода.

Начните создавать свое решение для извлечения текста с помощью компонента PDFium VCL уже сегодня.