Page boundary detection in historical documents
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

34 lines
738B

  1. import sys
  2. import numpy as np
  3. import cv2
  4. import os
  5. if len(sys.argv) < 4:
  6. print("python %s manifest.txt dataset_dir out_dir" % __file__)
  7. exit()
  8. manifest_file = sys.argv[1]
  9. dataset_dir = sys.argv[2]
  10. out_dir = sys.argv[3]
  11. try:
  12. os.makedirs(out_dir)
  13. except:
  14. pass
  15. file_list = [s.strip() for s in open(manifest_file, 'r').readlines()]
  16. for line in file_list:
  17. tokens = line.split(',')
  18. f = tokens[0]
  19. coords = list(map(float, tokens[1:9]))
  20. resolved = os.path.join(dataset_dir, f)
  21. im = cv2.imread(resolved, 0)
  22. gt = np.zeros(im.shape, dtype=np.uint8)
  23. cv2.fillPoly(gt, np.array(coords).reshape((4, 2)).astype(np.int32)[np.newaxis,:,:], 255)
  24. out_fn = os.path.join(out_dir, f.replace('/', '_'))[:-4] + ".png"
  25. cv2.imwrite(out_fn, gt)