|
1 | 1 | /** |
2 | 2 | * This is a sample Go program that demonstrates how to use the UniPDF library |
3 | | - * to extrtact text from within images in a PDF using an OCR service that returns |
| 3 | + * to extract text from within images in a PDF using an OCR service that returns |
4 | 4 | * hOCR-formatted output, then writes the reconstructed text to a new PDF. |
5 | 5 | * |
6 | | - * Run as: go run reconstruct_pdf.go input.pdf |
| 6 | + * Run as: go run reconstruct_pdf_from_hocr.go input.pdf |
7 | 7 | */ |
8 | 8 | package main |
9 | 9 |
|
@@ -118,7 +118,7 @@ func ParseTitleAttributes(title string) *TitleAttributes { |
118 | 118 | xDescendersRe := regexp.MustCompile(`x_descenders\s+([\d.]+)`) |
119 | 119 | xAscendersRe := regexp.MustCompile(`x_ascenders\s+([\d.]+)`) |
120 | 120 | xWConfRe := regexp.MustCompile(`x_wconf\s+(\d+)`) |
121 | | - ppagenoRe := regexp.MustCompile(`ppageno\s+(\d+)`) |
| 121 | + pagenoRe := regexp.MustCompile(`ppageno\s+(\d+)`) |
122 | 122 | imageRe := regexp.MustCompile(`image\s+"([^"]*)"`) |
123 | 123 |
|
124 | 124 | // Parse bbox |
@@ -158,7 +158,7 @@ func ParseTitleAttributes(title string) *TitleAttributes { |
158 | 158 | } |
159 | 159 |
|
160 | 160 | // Parse ppageno |
161 | | - if matches := ppagenoRe.FindStringSubmatch(title); matches != nil { |
| 161 | + if matches := pagenoRe.FindStringSubmatch(title); matches != nil { |
162 | 162 | attrs.PageNo, _ = strconv.Atoi(matches[1]) |
163 | 163 | } |
164 | 164 |
|
@@ -261,23 +261,23 @@ func init() { |
261 | 261 |
|
262 | 262 | func main() { |
263 | 263 | if len(os.Args) < 2 { |
264 | | - fmt.Printf("Usage: go run reconstruct_pdf.go input.pdf\n") |
| 264 | + fmt.Printf("Usage: go run reconstruct_pdf_from_hocr.go input.pdf\n") |
265 | 265 | os.Exit(1) |
266 | 266 | } |
267 | 267 |
|
268 | 268 | // Load images from the PDF. |
269 | 269 | images, err := loadImages(os.Args[1]) |
270 | 270 | if err != nil { |
271 | | - fmt.Printf("Error loading images: %s", err) |
272 | | - return |
| 271 | + fmt.Printf("Error loading images: %v\n", err) |
| 272 | + os.Exit(1) |
273 | 273 | } |
274 | 274 |
|
275 | 275 | outDir := "output" |
276 | 276 | if _, err := os.Stat(outDir); os.IsNotExist(err) { |
277 | 277 | err := os.Mkdir(outDir, 0755) |
278 | 278 | if err != nil { |
279 | | - fmt.Printf("Error creating output directory: %s", err) |
280 | | - return |
| 279 | + fmt.Printf("Error creating output directory: %v\n", err) |
| 280 | + os.Exit(1) |
281 | 281 | } |
282 | 282 | } |
283 | 283 |
|
@@ -384,24 +384,24 @@ func processImage(img image.Image) (*OCRPage, error) { |
384 | 384 |
|
385 | 385 | result, err := client.ExtractText(context.Background(), imgReader, "image.jpg") |
386 | 386 | if err != nil { |
387 | | - fmt.Printf("Error extracting text: %s", err) |
388 | | - return nil, err |
| 387 | + return nil, fmt.Errorf("error extracting text: %w", err) |
389 | 388 | } |
390 | 389 |
|
391 | 390 | // Parse JSON response to extract the "result" field. |
392 | 391 | var jsonObj map[string]interface{} |
393 | 392 | if err := json.Unmarshal(result, &jsonObj); err != nil { |
394 | | - fmt.Printf("Error parsing JSON response: %s", err) |
395 | | - return nil, err |
| 393 | + return nil, fmt.Errorf("error parsing JSON response: %w", err) |
396 | 394 | } |
397 | 395 |
|
398 | | - content := jsonObj["result"].(string) |
| 396 | + content, ok := jsonObj["result"].(string) |
| 397 | + if !ok { |
| 398 | + return nil, fmt.Errorf("result field is not a string") |
| 399 | + } |
399 | 400 |
|
400 | 401 | // Parse hOCR HTML content |
401 | 402 | var ocrPage OCRPage |
402 | 403 | if err := xml.Unmarshal([]byte(content), &ocrPage); err != nil { |
403 | | - fmt.Printf("Error unmarshalling HOCR data: %s\n", err) |
404 | | - return nil, err |
| 404 | + return nil, fmt.Errorf("error unmarshalling HOCR data: %w", err) |
405 | 405 | } |
406 | 406 |
|
407 | 407 | return &ocrPage, nil |
|
0 commit comments