#!/usr/bin/python3

#
# pdfselect
#
# Copyright (C) 2022-2023 by John Heidemann <johnh@isi.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License,
# version 2, as published by the Free Software Foundation.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
#


import argparse
import sys
import pdb
#        pdb.set_trace()


# Version and release date; both are reported by the --version option.
__version__ = '1.0'
__releasedate__ = '2023-08-12'

# PyPDF2
from PyPDF2 import PdfFileReader, PdfFileWriter, PageRange

class ParseError(ValueError):
    """
    Raised when a page specification cannot be parsed.
    Subclasses ValueError so callers that catch ValueError keep working.
    """


class HumanPDFPageList:
    """
    A list-like representation of human page ranges.
    Just like PyPDF2.PageRange, but a list not a slice,
    and 1-based, not 0-based,
    and missing other fancy options.

    The page numbers are NOT checked against the document length here;
    callers must range-check them before indexing into the document.
    """

    def __init__(self, pdf_doc, spec):
        """
        Initialize with a string (only), like "begin,3-5,8-end".

        pdf_doc supplies the context: only len(pdf_doc.pages) is used,
        to resolve "end" and from-the-end ("_N") page numbers.
        spec is a comma-separated list of groups; each group is either
        a single page or a "first-last" range (inclusive).  An omitted
        first defaults to "begin" and an omitted last to "end",
        so "-" means the whole document.

        Raises ParseError if a group has more than one "-"
        or a page is neither a number, "begin", nor "end".
        """
        # xxx: this function is greedy.  ideally it would be lazy.
        self._list = []
        self._pdf_doc = pdf_doc   # context
        for group in spec.split(","):
            parts = group.split("-")
            if len(parts) == 1:
                self._list.append(self._part_to_int(parts[0], "begin"))
            elif len(parts) == 2:
                first = self._part_to_int(parts[0], "begin")
                last = self._part_to_int(parts[1], "end")
                # human ranges are inclusive, so go one past last
                self._list.extend(range(first, last + 1))
            else:
                raise ParseError(f"bad page range: {group!r}")

    def __iter__(self):
        """
        Iterate over the 1-based page numbers in specification order.
        """
        yield from self._list

    def _part_to_int(self, part, default):
        """
        Convert a page number or name to the number (or default).
        Handles "begin" and "end" based on context of the document we're in,
        and _N means from the end (so _1 is the same as end).

        Raises ParseError if the part is not a number or a known name.
        """
        original = part
        # tolerate whitespace around names, as int() already does for numbers
        part = part.strip() if part is not None else ''
        if part == '':
            part = default
        from_end = part.startswith('_')
        if from_end:
            part = part.lstrip("_")
        page_count = len(self._pdf_doc.pages)
        if part == "begin":
            page = 1
        elif part == "end":
            page = page_count
        else:
            try:
                page = int(part)
            except ValueError as err:
                raise ParseError(f"bad page number: {original!r}") from err
        if from_end:
            # _1 is the last page, _2 the one before it, and so on
            page = page_count + 1 - page
        return page

    def to_list(self):
        """
        Return the list of pages.
        """
        return self._list
    

class Program:
    """
    The pdfselect command-line program.
    Constructing an instance parses the command line (sys.argv)
    and immediately performs the page selection.
    """

    def __init__(self):
        self.parse_args()
        self.select()

    def parse_args(self):
        """
        Parse the command line into self.args.

        Both positional arguments are optional: the input defaults to
        /dev/fd/0 (stdin) and a missing output means stdout.
        """
        parser = argparse.ArgumentParser(
            description = 'select PDF pages from a document',
            # the epilog is laid out by hand, so keep argparse from re-wrapping it
            formatter_class=argparse.RawDescriptionHelpFormatter,
            epilog="""
Select pages from a PDF file (first argument),
writing to an output file (second argument),
like psselect, but for pdf.
(For people who are sad they can no longer install pdftk.)

Defaults to writing to stdout and reading from stdin if arguments are omittted.

OPTIONS:
    -p PAGES, --pages=PAGES
        list the pages to extract, like begin-2,5-7,10-end
        (Unlike psselect, we do not support -R as a synonym.)

    -r    reverse order

    -e    even pages only

    -o    odd pages only

    -h, --help    show help

    --version  show version

    -v     more verbose output (different than psselect)
        
(Unlike psselect, we don't output anything ever.
Nor do we have a -q --quiet option to suppress output.)        

EXAMPLE:

    pdfselect -p 2-3 source.pdf dest.pdf

THANKS
        Thanks to the PyPDF2 library that does the actual work,
        and to psselect for a reasonable UI.

AUTHOR
        Copyright (C) 2022-2023 by John Heidemann <johnh@isi.edu>
        
        Pdfselect is licensed under the GNU General Public License,
        version 2, as published by the Free Software Foundation.
        """)
        # see https://docs.python.org/3/library/argparse.html
        #  ArgumentParser.add_argument(name or flags...[, action][, nargs][, const][, default][, type][, choices][, required][, help][, metavar][, dest])

        parser.add_argument('--pages', '-p', help='page specification to extract', default = '-')
        parser.add_argument('--debug', help='debugging mode', action='store_true', default = False)
        parser.add_argument('--verbose', '-v', action='count')
        # --even and --odd share one destination; the last one given wins
        parser.add_argument('--even', '-e', help='select even pages only', action='store_const', const='even', dest='downselect')
        parser.add_argument('--odd', '-o', help='select odd pages only', action='store_const', const='odd', dest='downselect', default=None)
        parser.add_argument('--reverse', '-r', help='reverse page order', action='store_true', default=None)
        parser.add_argument('--version', help='show version', action='version', version='pdfselect ' + __version__ + ' ' + __releasedate__)
        # nargs='?' is what makes a positional optional; without it
        # argparse insists on the argument and the default is never used.
        parser.add_argument('input_path', nargs='?', default = '/dev/fd/0')
        parser.add_argument('output_path', nargs='?', default = None)
        self.args = parser.parse_args()

    def select(self):
        """
        Copy the selected pages from the input PDF to the output.

        Pages come from --pages, optionally reversed (--reverse) and
        filtered to even or odd page numbers (--even / --odd).
        Pages outside the document are skipped with a warning on stderr.
        Writes to self.args.output_path, or to stdout if it is not set.
        """
        reader = PdfFileReader(self.args.input_path)
        writer = PdfFileWriter()
        page_count = len(reader.pages)

        page_list = list(HumanPDFPageList(reader, self.args.pages))
        if self.args.reverse:
            page_list.reverse()
        downselect = self.args.downselect
        for human_page in page_list:
            if downselect == 'even' and human_page % 2 == 1:
                continue
            if downselect == 'odd' and human_page % 2 == 0:
                continue
            # Human pages are 1-based: page 0 must be rejected too,
            # otherwise index -1 would silently pick the last page.
            if human_page < 1 or human_page > page_count:
                # warn on stderr, because stdout may be carrying the PDF
                print(f"pdfselect: ignoring page {human_page} out of range", file=sys.stderr)
                continue
            writer.addPage(reader.pages[human_page - 1])

        if self.args.output_path:
            with open(self.args.output_path, "wb") as output:
                writer.write(output)
        else:
            # PDF is binary, so bypass the text layer of stdout
            sys.stdout.flush()
            writer.write(sys.stdout.buffer)
            sys.stdout.buffer.flush()
            

if __name__ == '__main__':
    # Constructing Program parses the command line and performs the selection.
    Program()
    sys.exit(0)