#!/usr/bin/env python3
#-------------------------------------------------------------------------------
# t a s k f i l e . p y   --   read, parse and generate task files
#-------------------------------------------------------------------------------
"""
This module manages "task files", files which describe the images to be used
for training and testing vision systems on particular problems.  There are
routines for loading them, either from local files or over the Web, and
returning the information in them for use in programs.  It is also possible
to output task files.

The module can also be used as a stand-alone program for reading and reporting
the content of these task files.  An example (very short) task file is:

  # Adrian F Clark <alien@essex.ac.uk> 2020-04-28

  name: 5sh5col-shape
  type: vision
  purpose:
    Classification of images of coloured shapes on the basis of shape.

  class:
    Background  #000000
    square      #ff0000
    star        #00ff00
    flower      #0000ff
    leaf        #ffff00
    snowflake   #ff00ff

  property:
    annotation marked

  dataset: train
    5sh5col-01.jpg smask-5sh5col-01.png
    5sh5col-02.jpg smask-5sh5col-02.png
    5sh5col-03.jpg smask-5sh5col-03.png
    5sh5col-04.jpg smask-5sh5col-04.png
    5sh5col-05.jpg smask-5sh5col-05.png

  dataset: test
    5sh5col-06.jpg smask-5sh5col-06.png
    5sh5col-07.jpg smask-5sh5col-07.png
    5sh5col-08.jpg smask-5sh5col-08.png
    5sh5col-09.jpg smask-5sh5col-09.png
    5sh5col-10.jpg smask-5sh5col-10.png

Blank lines and those starting with "#" are ignored.  Lines that start with a
non-whitespace character in the first column and contain a colon character
introduce a section of the file, the section name being the text up to the
colon.  The contents of the section can either follow the name (as in "type"
above) or can be provided on subsequent lines which start with whitespace
characters (as in "train" above).  Section names should be in lower case.
The following sections are supported:

name: a simple mnemonic that describes this task uniquely

type: the nature of the vision task, one of:

   "label"   assign a single textual label to the entire image
             (used for MNIST, for example)

   "vision"  a mask image shows which class of feature appears where for
             every training or test image (as in the shape example above)

purpose: a description of the nature of the task

class: a series of pairs of inputs, giving the name of a class and the colour
   used to represent which pixels in training images contain that class

dataset: a dataset used for training, testing, etc; its name follows the
   keyword and the actual data follow on subsequent lines, though the
   information provided differs according to the type of task:

   "label"   a single word giving the textual label for the image

   "vision"  an image containing pixel-level ground truth

   Typical datasets are "train" and "test".

property: optional properties of the database which can be queried by a program

The "class" section need be provided only for "vision" tasks.  The "type"
section must precede the "dataset" sections.

Note that "vision" images used for training do not necessarily identify which
class every pixel belongs to, only a set of pixels or regions that its author
thought indicative of the classes.  However, a well-formed validation or test
image should classify correctly to the pixel level: this is because one is
often interested in how well the algorithms used to process the imagery perform
on every pixel, for example if the problem involves segmentation.  The property
value set in the above example is the conventional way of conveying this.

A task file is represented by this module as a Python dictionary indexed by
section name.  The contents of a section is either a string (name, type,
purpose), a list (class, dataset name), or a dict (property).  Where the
contents of a string section span multiple lines, they are concatenated as
they are read in.
"""
#-------------------------------------------------------------------------------
# REVISION HISTORY
#-------------------------------------------------------------------------------
# 2021-07-09  Original version.
# 2021-10-05  Revised to support the "dataset" syntax.
# 2021-10-27  Added support for "property" settings, refactored code.
# 2022-08-22  Some further refactoring for "vision" tasks.
# 2022-11-01  Do the right thing when some tags are undefined.
#-------------------------------------------------------------------------------

# Boilerplate.
import sys, argparse
from datetime import datetime

def error (msg, status=-1):
    """
    Output an error message on stderr and optionally terminate the program.

    Arguments:
      msg     text of the message; it is output after the prefix "Error: "
      status  exit status: if it is zero or positive the program exits with
              it, while the default of -1 means "report and carry on"
    """
    print ("Error: " + msg, file=sys.stderr)
    if status >= 0:
        # Use sys.exit rather than the exit() builtin: the latter is installed
        # by the site module for interactive use and is missing when Python
        # is run with -S or from some frozen executables.
        sys.exit (status)

def load (fn):
    """
    Read and parse a task file, returning its content as a dictionary.

    Arguments:
      fn  name of the task file; if it starts with "http://" or "https://"
          it is fetched as a URL and decoded as UTF-8, otherwise it is
          opened as a local file

    The dictionary returned has the entries "name", "type" and "purpose"
    (strings), "class" (a list), "property" (a dictionary), "__datasets__"
    (the dataset names in the order they were defined) and one entry per
    dataset holding its lines.  For "label" tasks, "class" is the sorted
    list of the labels encountered in the datasets.

    Problems with the file are reported via error(), which terminates the
    program with a non-zero status; a missing purpose yields only a warning.
    """
    # We'll unpack the task into the following dictionary.  Classes end up in
    # the "class" entry; the "__labels__" one is temporary storage.
    task = {
        "__datasets__": [],
        "__labels__": {},
        "name": "",
        "type": "",
        "purpose": "",
        "class": [],
        "property": {},
    }

    # If the "filename" is a URL, request it and store the lines that we
    # retrieve in variable "lines".  Otherwise, read the file's lines into
    # the same variable.  Either way, the context manager makes sure the
    # underlying resource is released even if reading fails.
    if fn.startswith (("http://", "https://")):
        import urllib.request
        with urllib.request.urlopen (fn) as response:
            lines = response.read().decode("utf-8").split ("\n")
    else:
        with open (fn) as f:
            lines = f.readlines ()

    # Process the individual lines.  The first line we encounter needs to be
    # the start of a chunk, so make sure its name isn't set to help us catch
    # erroneous input files.
    chunk = None
    for line in lines:
        # Remove trailing whitespace and handle blank lines and comments.
        line = line.rstrip ()
        if not line or line[0] == "#":
            continue

        # Determine whether it's the start of a chunk or a continuation.
        if line[0].isspace ():
            # It's the continuation of a chunk already started.
            if chunk is None:
                error ("Task file '%s' doesn't start with a chunk name!" \
                       % fn, status=1)
            parse_task_chunk (task, chunk, line, True)
            continue

        # It's a new chunk.  Find the colon and handle the cases where it
        # is missing or at the start of the line.
        pos = line.find (":")
        if pos < 0:
            msg = "The following line should start a chunk but"
            msg += " has no colon:\n   %s"
            error (msg % line, status=2)
        chunk = line[:pos].strip ()
        if not chunk:
            msg = "The following line should start a chunk but"
            msg += " has no chunk name:\n   %s"
            error (msg % line, status=3)

        # Extract any remaining content from the line and assign it to the
        # new chunk.  A "dataset" chunk is always passed on, even when
        # nothing follows the colon, so that a missing dataset name is
        # reported rather than having the lines that follow silently added
        # to the previous dataset.
        rest = line[pos+1:]
        if rest or chunk == "dataset":
            parse_task_chunk (task, chunk, rest, False)

    # Make sure we have a list of classes.  The temporary book-keeping is
    # removed whatever the type of task so that it never reaches the caller.
    labels = task.pop ("__labels__")
    if task["type"] == "label":
        task["class"] = sorted (labels)

    # Do a bit of sanity-checking.
    if not task["name"]:
        error ("Task contains no name!", status=10)
    if not task["purpose"]:
        # Only a warning: task files without a purpose have always loaded.
        error ("Task contains no purpose!")
    if not task["type"]:
        error ("Task contains no type!", status=10)
    if task["type"] not in ("vision", "label"):
        error ("Only 'vision' and 'label' tasks are supported!", status=11)
    if not task["__datasets__"]:
        error ("There are no datasets in the task file!", status=12)

    # Return what we have loaded.
    return task

def list_numbers_in_classes (task, section):
    """
    Return a table giving the number of images with each class label.

    Arguments:
      task     task dictionary; its "type" and "class" entries are used
      section  name of the entry of task (normally a dataset) to tally,
               a sequence of (filename, label) pairs

    The result is a header line followed by one line per entry of
    task["class"], or the empty string if this is not a "label" task.
    """
    # Counting images per class is meaningful only for "label" tasks.
    if task["type"] != "label":
        return ""

    # Start every known label at zero, then tally the section's entries.
    tally = dict.fromkeys (task["class"], 0)
    for _, label in task[section]:
        tally[label] += 1

    # Lay the counts out in the order the classes are listed.
    rows = ["%10d  %s\n" % (tally[label], label) for label in task["class"]]
    return "    number  label\n" + "".join (rows)

def parse_task_chunk (task, chunk, line, contin):
    """
    Parse one line of a task file and store its content in the task.

    Arguments:
      task    dictionary being filled in; it is modified in place
      chunk   name of the section to which the line belongs
      line    text to parse, either what followed the colon on the line
              that introduced the section or a complete continuation line
      contin  True if the line continues a section, False if it is the
              text from the line that introduced it

    Malformed content and unknown sections are reported via error() with a
    non-negative status, which terminates the program.
    """
    # Strip off leading and trailing whitespace.
    text = line.strip ()

    # Handle the line according to the chunk.
    if chunk in ("name", "type", "purpose"):
        if contin and task[chunk]:
            # Join with a space so that the last word of one line and the
            # first word of the next do not run together.
            task[chunk] += " " + text
        else:
            task[chunk] = text

    elif chunk == "class":
        # We want a series of [class, colour] pairs.
        words = text.split ()
        if len (words) != 2:
            error ("Problem with 'class' section of task file: '%s'" % text,
                   status=5)
        task[chunk].append (words)

    elif chunk == "property":
        # We want a series of [name, value] pairs.
        words = text.split ()
        if len (words) != 2:
            error ("Problem with 'property' section of task file: '%s'" % text,
                   status=5)
        task[chunk][words[0]] = words[1]

    elif chunk == "dataset":
        words = text.split ()
        if contin:
            # The data belong to the most recently named dataset, so there
            # has to be one.
            if not task["__datasets__"]:
                error ("Dataset content found before any dataset was named:"
                       " '%s'" % text, status=4)
            ds = task["__datasets__"][-1]
            # We want a series of [image, mask] or [image, label] pairs.
            if len (words) != 2:
                error ("Problem with '%s' section of task file: '%s'" %
                       (ds, text), status=4)
            task[ds].append (words)
            # Remember which classes we have.
            if task["type"] == "label":
                task["__labels__"][words[1]] = True

        else:
            # The line that introduces a dataset carries just its name.
            if len (words) != 1:
                error ("Problem with 'dataset' section of task file: '%s'" \
                       % text,  status=5)
            ds = words[0]
            # Datasets share the dictionary with the other sections, so a
            # name already present would overwrite a section or an earlier
            # dataset.
            if ds in task:
                error ("Dataset name '%s' is reserved or already in use" % ds,
                       status=5)
            task["__datasets__"].append (ds)
            task[ds] = []

    else:
        error ("Unknown chunk '%s' in task file" % chunk, status=15)

def review (task):
    """
    Summarise the contents of a task as text and return it.

    Arguments:
      task  task dictionary; its "name", "purpose", "type", "property" and
            "__datasets__" entries are used and, for "label" tasks, the
            datasets themselves

    The summary has no trailing newline.
    """
    # Start with the name, purpose and type.
    pieces = [
        task["name"], ":\n",
        task["purpose"], "\n",
        "(This is a ", task["type"], " task.)\n",
    ]

    # Any properties that have been set come next, in sorted order.
    names = sorted (task["property"].keys ())
    if names:
        pieces.append ("Properties:\n")
        for prop in names:
            pieces += ["  ", prop, ": ", task["property"][prop], "\n"]

    # Then the names of the datasets and, where it makes sense, a count of
    # the images in each class of every dataset.
    datasets = task["__datasets__"]
    pieces += ["Datasets: ", " ".join (datasets), "\n"]
    if task["type"] == "label":
        for ds in datasets:
            pieces += ["Dataset ", ds, ":\n",
                       list_numbers_in_classes (task, ds)]

    # Drop the final newline.
    return "".join (pieces)[:-1]

def save (task, fn=None, note=None):
    """
    Output a task in the same form as those that load() can read in.

    Arguments:
      task  task dictionary to be written
      fn    name of the file to write; if None or "-", the text is printed
            on standard output instead
      note  optional text to include as a comment at the top of the output
    """
    now = datetime.now().strftime ("%Y-%m-%d at %H:%M:%S")
    parts = ["# Produced by %s on %s.\n" % (sys.argv[0], now)]

    # If there was a note, output it followed by a blank line.  Every line
    # of the note is prefixed so that a multi-line note stays a comment.
    if note:
        for note_line in note.splitlines ():
            parts.append ("# " + note_line + "\n")
    parts.append ("\n")

    # Work through the introductory sections.
    for sec in ("name", "type", "purpose"):
        parts.append ("%s:\n  %s\n\n" % (sec, task[sec]))

    # If there were properties, output them as "name value" pairs, which is
    # the form in which they are parsed.
    props = task.get ("property")
    if props:
        parts.append ("property:\n")
        for key in sorted (props):
            parts.append ("  %s %s\n" % (key, props[key]))
        parts.append ("\n")

    # If there were classes, output them.  Only "vision" tasks define their
    # classes in the file; those of "label" tasks come from the datasets.
    if task["type"] == "vision" and task.get ("class"):
        parts.append ("class:\n")
        for entry in task["class"]:
            parts.append ("  " + " ".join (entry) + "\n")
        parts.append ("\n")

    # Finish off with the datasets.
    for ds in task.get ("__datasets__", []):
        parts.append ("dataset: %s\n" % ds)
        for entry in task[ds]:
            parts.append ("  %s %s\n" % (entry[0], entry[1]))
        parts.append ("\n")

    # Having generated the text, output it.  The final newline is dropped
    # because print supplies one.
    text = "".join (parts)
    if fn is None or fn == "-":
        print (text[:-1])
    else:
        with open (fn, "w") as f:
            print (text[:-1], file=f)

#-------------------------------------------------------------------------------
# Main program.
#-------------------------------------------------------------------------------
if __name__ == "__main__":

    # Handle the command line.
    parser = argparse.ArgumentParser (description=__doc__,
                   formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument ("taskfile",
                         help="file containing the vision task")
    parser.add_argument ("operation", nargs="?", default="review",
                         help="operation to perform on the taskfile")
    args = parser.parse_args()

    # Load the task file and pull out its sections.
    task = load (args.taskfile)

    # The operation is either one we handle here or the name of a section,
    # in which case we output the information line by line.
    if args.operation == "review":
        print (review (task))

    elif args.operation == "save":
        save (task)

    elif args.operation in task:
        content = task[args.operation]
        if isinstance (content, list):
            for line in content:
                # An entry is either a list of words or a plain string (the
                # dataset names, the classes of a "label" task); joining a
                # string would put a space between each of its characters.
                if isinstance (line, str):
                    print (line)
                else:
                    print (" ".join (line))
        elif isinstance (content, dict):
            for k in sorted (content):
                print ("%s: %s" % (k, content[k]))
        else:
            print (content)

    else:
        error ("Unknown operation or taskfile section name.", status=99)

#-------------------------------------------------------------------------------
# End of taskfile.py
#-------------------------------------------------------------------------------
