Rename reconstruct_pdf_from hocr.go to reconstruct_pdf_from_hocr.go; fix typos and return wrapped errors

3ace · 3ace · commit 39ddc50a48b4 · 2025-11-20T16:53:38.000+07:00
diff --git a/ocr/README.md b/ocr/README.md
@@ -9,7 +9,7 @@ The OCR functionality works by sending images to a configured HTTP endpoint that
 - [hocr_sample.go](hocr_sample.go) illustrates how to process HOCR formatted OCR output, parsing word-level information including bounding boxes and confidence scores.
 - [ocr_batch.go](ocr_batch.go) shows how to perform batch OCR processing on multiple images concurrently, with error handling and summary reporting.
 - [ocr_sample.go](ocr_sample.go) demonstrates basic OCR usage by sending a single image to an HTTP OCR service and extracting the text content.
-- [reconstruct_pdf_from hocr.go](reconstruct_pdf_from%20hocr.go) demonstrates a complete workflow to extract images from a PDF, perform OCR with HOCR output, parse the structured results, and reconstruct a searchable PDF with properly positioned text.
+- [reconstruct_pdf_from_hocr.go](reconstruct_pdf_from_hocr.go) demonstrates a complete workflow to extract images from a PDF, perform OCR with HOCR output, parse the structured results, and reconstruct a searchable PDF with properly positioned text.
 
 ## Requirements
 
diff --git a/ocr/reconstruct_pdf_from_hocr.go b/ocr/reconstruct_pdf_from_hocr.go
@@ -1,9 +1,9 @@
 /**
  * This is a sample Go program that demonstrates how to use the UniPDF library
- * to extrtact text from within images in a PDF using an OCR service that returns
+ * to extract text from within images in a PDF using an OCR service that returns
  * HOCR formatted output then writes the reconstructed text to a new PDF.
  *
- * Run as: go run reconstruct_pdf.go input.pdf
+ * Run as: go run reconstruct_pdf_from_hocr.go input.pdf
  */
 package main
 
@@ -118,7 +118,7 @@ func ParseTitleAttributes(title string) *TitleAttributes {
 	xDescendersRe := regexp.MustCompile(`x_descenders\s+([\d.]+)`)
 	xAscendersRe := regexp.MustCompile(`x_ascenders\s+([\d.]+)`)
 	xWConfRe := regexp.MustCompile(`x_wconf\s+(\d+)`)
-	ppagenoRe := regexp.MustCompile(`ppageno\s+(\d+)`)
+	pagenoRe := regexp.MustCompile(`ppageno\s+(\d+)`)
 	imageRe := regexp.MustCompile(`image\s+"([^"]*)"`)
 
 	// Parse bbox
@@ -158,7 +158,7 @@ func ParseTitleAttributes(title string) *TitleAttributes {
 	}
 
 	// Parse ppageno
-	if matches := ppagenoRe.FindStringSubmatch(title); matches != nil {
+	if matches := pagenoRe.FindStringSubmatch(title); matches != nil {
 		attrs.PageNo, _ = strconv.Atoi(matches[1])
 	}
 
@@ -261,23 +261,23 @@ func init() {
 
 func main() {
 	if len(os.Args) < 2 {
-		fmt.Printf("Usage: go run reconstruct_pdf.go input.pdf\n")
+		fmt.Printf("Usage: go run reconstruct_pdf_from_hocr.go input.pdf\n")
 		os.Exit(1)
 	}
 
 	// Load images from the PDF.
 	images, err := loadImages(os.Args[1])
 	if err != nil {
-		fmt.Printf("Error loading images: %s", err)
-		return
+		fmt.Printf("Error loading images: %v\n", err)
+		os.Exit(1)
 	}
 
 	outDir := "output"
 	if _, err := os.Stat(outDir); os.IsNotExist(err) {
 		err := os.Mkdir(outDir, 0755)
 		if err != nil {
-			fmt.Printf("Error creating output directory: %s", err)
-			return
+			fmt.Printf("Error creating output directory: %v\n", err)
+			os.Exit(1)
 		}
 	}
 
@@ -384,24 +384,24 @@ func processImage(img image.Image) (*OCRPage, error) {
 
 	result, err := client.ExtractText(context.Background(), imgReader, "image.jpg")
 	if err != nil {
-		fmt.Printf("Error extracting text: %s", err)
-		return nil, err
+		return nil, fmt.Errorf("error extracting text: %w", err)
 	}
 
 	// Parse JSON response to extract the "result" field.
 	var jsonObj map[string]interface{}
 	if err := json.Unmarshal(result, &jsonObj); err != nil {
-		fmt.Printf("Error parsing JSON response: %s", err)
-		return nil, err
+		return nil, fmt.Errorf("error parsing JSON response: %w", err)
 	}
 
-	content := jsonObj["result"].(string)
+	content, ok := jsonObj["result"].(string)
+	if !ok {
+		return nil, fmt.Errorf("result field is not a string")
+	}
 
 	// Parse hOCR HTML content
 	var ocrPage OCRPage
 	if err := xml.Unmarshal([]byte(content), &ocrPage); err != nil {
-		fmt.Printf("Error unmarshalling HOCR data: %s\n", err)
-		return nil, err
+		return nil, fmt.Errorf("error unmarshalling HOCR data: %w", err)
 	}
 
 	return &ocrPage, nil