#!/usr/bin/python3
#
# pdfselect
#
# Copyright (C) 2022-2023 by John Heidemann
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License,
# version 2, as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
#

import argparse
import sys
import pdb
# pdb.set_trace()

__version__ = '1.0'
__releasedate__ = '2023-08-12'

# PyPDF2
# NOTE(review): these are the pre-3.0 PyPDF2 class names; confirm that the
# installed PyPDF2 release still supports them.
try:
    from PyPDF2 import PdfFileReader, PdfFileWriter, PageRange
except ImportError:
    # Keep the module importable so --help, --version and the page-spec
    # parser work without the library; select() reports the problem.
    PdfFileReader = PdfFileWriter = PageRange = None


class ParseError(ValueError):
    """Raised when a page specification cannot be understood."""


class HumanPDFPageList:
    """
    A list-like representation of human page ranges.

    Just like PyPDF2.PageRange, but a list not a slice,
    and 1-based, not 0-based,
    and missing other fancy options.
    """

    def __init__(self, pdf_doc, spec):
        """
        Initialize with a string (only), like "begin,3-5,8-end".

        pdf_doc is any object with a ``pages`` sequence; its length gives
        meaning to "end" and to the _N (count from the end) syntax.

        Raises ParseError if a group has more than one "-" or a page
        is neither a number nor "begin"/"end".
        """
        # xxx: this function is greedy.  ideally it would be lazy.
        self._list = []
        self._pdf_doc = pdf_doc  # context
        for group in spec.split(","):
            parts = group.split("-")
            if len(parts) == 1:
                self._list.append(self._part_to_int(parts[0], "begin"))
            elif len(parts) == 2:
                first = self._part_to_int(parts[0], "begin")
                last = self._part_to_int(parts[1], "end")
                # Human ranges are inclusive at both ends.
                self._list.extend(range(first, last + 1))
            else:
                raise ParseError(group)

    def __iter__(self):
        yield from self._list

    def _part_to_int(self, part, default):
        """
        Convert a page number or name to the number (or default).

        Handles "begin" and "end" based on context of the document we're in,
        and _N means from the end (so _1 is the same as end).
        An empty (or None) part is replaced by default.

        Raises ParseError if the part is not "begin", "end", or an integer.
        """
        if part is None or part == '':
            part = default
        token = part  # kept intact for the error message
        from_end = part[0] == '_'
        if from_end:
            part = part.lstrip("_")
        if part == "begin":
            page = 1
        elif part == "end":
            page = len(self._pdf_doc.pages)
        else:
            try:
                page = int(part)
            except ValueError:
                raise ParseError(token) from None
        if from_end:
            page = len(self._pdf_doc.pages) + 1 - page
        return page

    def to_list(self):
        """
        Return the list of pages.
        """
        return self._list


class Program:
    """The pdfselect command: parse the command line, then copy pages."""

    def __init__(self):
        self.parse_args()
        self.select()

    def parse_args(self, argv=None):
        """
        Parse the command line into self.args.

        argv is the list of arguments to parse; None (the default)
        means sys.argv[1:], as argparse does on its own.
        """
        # The default help formatter re-wraps the epilog, so the line
        # breaks below are only for the reader of this source.
        parser = argparse.ArgumentParser(
            description = 'select PDF pages from a document',
            epilog="""
Select pages from a PDF file (first argument),
writing to an output file (second argument),
like psselect, but for pdf.
(For people who are sad they can no longer install pdftk.)

Defaults to writing to stdout and reading from stdin
if arguments are omittted.

OPTIONS:

-p PAGES, --pages=PAGES
    list the pages to extract, like begin-2,5-7,10-end
    (Unlike psselect, we do not support -R as a synonym.)
-r  reverse order
-e  even pages only
-o  odd pages only
-h, --help  show help
--version   show version
-v  more verbose output (different than psselect)

(Unlike psselect, we don't output anything ever.
Nor do we have a -q --quiet option to suppress output.)

EXAMPLE:

pdfselect -p 2-3 source.pdf dest.pdf

THANKS

Thanks to the PyPDF2 library that does the actual work,
and to psselect for a reasonable UI.

AUTHOR

Copyright (C) 2022-2023 by John Heidemann

Pdfselect is licensed under the GNU General Public License,
version 2, as published by the Free Software Foundation.
""")
        # see https://docs.python.org/3/library/argparse.html
        # ArgumentParser.add_argument(name or flags...[, action][, nargs][, const][, default][, type][, choices][, required][, help][, metavar][, dest])
        parser.add_argument('--pages', '-p', help='page specification to extract', default = '-')
        parser.add_argument('--debug', help='debugging mode', action='store_true', default = False)
        parser.add_argument('--verbose', '-v', action='count')
        # --even and --odd share one destination; the last one given wins.
        parser.add_argument('--even', '-e', help='select even pages only', action='store_const', const='even', dest='downselect')
        parser.add_argument('--odd', '-o', help='select odd pages only', action='store_const', const='odd', dest='downselect', default=None)
        parser.add_argument('--reverse', '-r', help='reverse page order', action='store_true', default=False)
        parser.add_argument('--version', help='show version', action='version', version='pdfselect ' + __version__ + ' ' + __releasedate__)
        # nargs='?' is what makes the defaults take effect: without it
        # argparse treats both positionals as required.
        parser.add_argument('input_path', nargs='?', default = '/dev/fd/0')
        parser.add_argument('output_path', nargs='?', default = None)
        self.args = parser.parse_args(argv)

    def select(self):
        """
        Copy the requested pages from the input PDF to the output.

        Pages outside the document are skipped with a note on stderr.
        Exits with an error message if PyPDF2 is missing or the page
        specification cannot be parsed.
        """
        if PdfFileReader is None:
            sys.exit("pdfselect: the PyPDF2 library is required but could not be imported")
        reader = PdfFileReader(self.args.input_path)
        writer = PdfFileWriter()
        try:
            page_list = list(HumanPDFPageList(reader, self.args.pages))
        except ParseError as err:
            sys.exit(f"pdfselect: cannot parse page specification: {err}")
        if self.args.reverse:
            page_list.reverse()
        page_count = len(reader.pages)
        downselect = self.args.downselect
        for human_page in page_list:
            if downselect == 'even' and human_page % 2 == 1:
                continue
            if downselect == 'odd' and human_page % 2 == 0:
                continue
            # Pages are 1-based: 0 must be rejected too, or the index
            # below becomes -1 and silently selects the last page.
            if human_page < 1 or human_page > page_count:
                # stderr, because stdout may be carrying the PDF itself.
                print(f"pdfselect: ignoring page {human_page} out of range", file=sys.stderr)
                continue
            writer.addPage(reader.pages[human_page - 1])
        if self.args.output_path:
            with open(self.args.output_path, "wb") as output:
                writer.write(output)
        else:
            # Flush any pending text before writing raw bytes to the
            # binary layer underneath sys.stdout.
            sys.stdout.flush()
            writer.write(sys.stdout.buffer)
            sys.stdout.buffer.flush()


if __name__ == '__main__':
    Program()
    sys.exit(0)