Skip to content

Commit 39ddc50

Browse files
committed
rename file
1 parent d2c4e86 commit 39ddc50

File tree

2 files changed

+17
-17
lines changed

2 files changed

+17
-17
lines changed

ocr/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ The OCR functionality works by sending images to a configured HTTP endpoint that
99
- [hocr_sample.go](hocr_sample.go) illustrates how to process HOCR formatted OCR output, parsing word-level information including bounding boxes and confidence scores.
1010
- [ocr_batch.go](ocr_batch.go) shows how to perform batch OCR processing on multiple images concurrently, with error handling and summary reporting.
1111
- [ocr_sample.go](ocr_sample.go) demonstrates basic OCR usage by sending a single image to an HTTP OCR service and extracting the text content.
12-
- [reconstruct_pdf_from hocr.go](reconstruct_pdf_from%20hocr.go) demonstrates a complete workflow to extract images from a PDF, perform OCR with HOCR output, parse the structured results, and reconstruct a searchable PDF with properly positioned text.
12+
- [reconstruct_pdf_from_hocr.go](reconstruct_pdf_from_hocr.go) demonstrates a complete workflow to extract images from a PDF, perform OCR with HOCR output, parse the structured results, and reconstruct a searchable PDF with properly positioned text.
1313

1414
## Requirements
1515

ocr/reconstruct_pdf_from hocr.go → ocr/reconstruct_pdf_from_hocr.go

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
/**
22
* This is a sample Go program that demonstrates how to use the UniPDF library
3-
* to extrtact text from within images in a PDF using an OCR service that returns
3+
* to extract text from within images in a PDF using an OCR service that returns
44
* HOCR formatted output then writes the reconstructed text to a new PDF.
55
*
6-
* Run as: go run reconstruct_pdf.go input.pdf
6+
* Run as: go run reconstruct_pdf_from_hocr.go input.pdf
77
*/
88
package main
99

@@ -118,7 +118,7 @@ func ParseTitleAttributes(title string) *TitleAttributes {
118118
xDescendersRe := regexp.MustCompile(`x_descenders\s+([\d.]+)`)
119119
xAscendersRe := regexp.MustCompile(`x_ascenders\s+([\d.]+)`)
120120
xWConfRe := regexp.MustCompile(`x_wconf\s+(\d+)`)
121-
ppagenoRe := regexp.MustCompile(`ppageno\s+(\d+)`)
121+
pagenoRe := regexp.MustCompile(`ppageno\s+(\d+)`)
122122
imageRe := regexp.MustCompile(`image\s+"([^"]*)"`)
123123

124124
// Parse bbox
@@ -158,7 +158,7 @@ func ParseTitleAttributes(title string) *TitleAttributes {
158158
}
159159

160160
// Parse ppageno
161-
if matches := ppagenoRe.FindStringSubmatch(title); matches != nil {
161+
if matches := pagenoRe.FindStringSubmatch(title); matches != nil {
162162
attrs.PageNo, _ = strconv.Atoi(matches[1])
163163
}
164164

@@ -261,23 +261,23 @@ func init() {
261261

262262
func main() {
263263
if len(os.Args) < 2 {
264-
fmt.Printf("Usage: go run reconstruct_pdf.go input.pdf\n")
264+
fmt.Printf("Usage: go run reconstruct_pdf_from_hocr.go input.pdf\n")
265265
os.Exit(1)
266266
}
267267

268268
// Load images from the PDF.
269269
images, err := loadImages(os.Args[1])
270270
if err != nil {
271-
fmt.Printf("Error loading images: %s", err)
272-
return
271+
fmt.Printf("Error loading images: %v\n", err)
272+
os.Exit(1)
273273
}
274274

275275
outDir := "output"
276276
if _, err := os.Stat(outDir); os.IsNotExist(err) {
277277
err := os.Mkdir(outDir, 0755)
278278
if err != nil {
279-
fmt.Printf("Error creating output directory: %s", err)
280-
return
279+
fmt.Printf("Error creating output directory: %v\n", err)
280+
os.Exit(1)
281281
}
282282
}
283283

@@ -384,24 +384,24 @@ func processImage(img image.Image) (*OCRPage, error) {
384384

385385
result, err := client.ExtractText(context.Background(), imgReader, "image.jpg")
386386
if err != nil {
387-
fmt.Printf("Error extracting text: %s", err)
388-
return nil, err
387+
return nil, fmt.Errorf("error extracting text: %w", err)
389388
}
390389

391390
// Parse JSON response to extract the "result" field.
392391
var jsonObj map[string]interface{}
393392
if err := json.Unmarshal(result, &jsonObj); err != nil {
394-
fmt.Printf("Error parsing JSON response: %s", err)
395-
return nil, err
393+
return nil, fmt.Errorf("error parsing JSON response: %w", err)
396394
}
397395

398-
content := jsonObj["result"].(string)
396+
content, ok := jsonObj["result"].(string)
397+
if !ok {
398+
return nil, fmt.Errorf("result field is not a string")
399+
}
399400

400401
// Parse hOCR HTML content
401402
var ocrPage OCRPage
402403
if err := xml.Unmarshal([]byte(content), &ocrPage); err != nil {
403-
fmt.Printf("Error unmarshalling HOCR data: %s\n", err)
404-
return nil, err
404+
return nil, fmt.Errorf("error unmarshalling HOCR data: %w", err)
405405
}
406406

407407
return &ocrPage, nil

0 commit comments

Comments
 (0)