package com.ldodds.more;

import java.io.*;
import java.util.*;

import com.hp.hpl.mesa.rdf.jena.model.Property;
import com.hp.hpl.mesa.rdf.jena.model.Resource;
import com.hp.hpl.mesa.rdf.jena.model.Model;
import com.hp.hpl.mesa.rdf.jena.model.ResIterator;
import com.hp.hpl.mesa.rdf.jena.model.Statement;
import com.hp.hpl.mesa.rdf.jena.model.RDFNode;
import com.hp.hpl.mesa.rdf.jena.mem.ModelMem;

import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.Section;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.NoPropertySetStreamException;
import org.apache.poi.poifs.eventfilesystem.*;

/**
 * Microsoft Office RDF Extractor
 *
 * <p>
 * MORE is a command-line application for extracting RDF metadata from the 
 * properties embedded in Microsoft Office documents. Access to the embedded 
 * data is provided using the POI HPSF API.</p>
 * 
 * <p>
 * The data is essentially name-value pairs with the names either being defined 
 * internally by the HPSF API (for core Office document metadata) or by the user for 
 * custom properties. These names are mapped to RDF properties using a mapping 
 * document: an RDF schema that contains a list of RDF properties, each of which 
 * is annotated with the property name embedded in the document. The application 
 * can then determine the correct statement to create for each property value using 
 * this information. 
 * </p>
 * 
 * <p>
 * For details of how to run the application see the usage() method.
 * </p>
 * 
 * <a href="http://pesenki.by.ru/internat/wonderstuff/songs/10.shtml?lg=en">Give Give Give, Me More More More...</a>
 *
 * @author ldodds
 */
public class MORE implements POIFSReaderListener
{
    /** Maps an HPSF property name (the "pid string") to the RDF Property used for it. */
    private Map _pid2Property;
    /** The model that accumulates every generated (or pre-loaded) statement. */
    private Model _model;
    /** Destination used by {@link #dumpRDF()}; wraps System.out unless replaced. */
    private Writer _writer;
    /** File names (Strings) queued by {@link #addFileToProcess(String)}. */
    private List _files;
    /** Resource for the document currently being read; set by {@link #process()}. */
    private Resource _document;

    /**
     * Creates an instance with an empty file list, an empty in-memory model, an
     * empty property mapping and a writer that wraps <code>System.out</code>.
     */
    public MORE()
    {
        _files = new ArrayList();
        _model = new ModelMem();
        // Start with an empty mapping so that processing documents without a prior
        // call to loadConfig() simply yields no statements instead of failing with
        // a NullPointerException inside the POI callback.
        _pid2Property = new HashMap();
        _writer = new OutputStreamWriter(System.out);
    }

    /**
     * Adds a file to the internal list of files that this MORE instance will process
     * @param file the filename of the file to process
     */
    public void addFileToProcess(String file)
    {
        _files.add(file);
    }

    /**
     * Instructs the MORE instance to load its configuration from the specified filename.
     * The mapping replaces any mapping loaded previously.
     * @param config the file name of the RDF Schema defining the mapping
     */
    public void loadConfig(String config)
    {
        _pid2Property = getPIDDictionary(config);
    }

    /**
     * Causes the MORE instance to load data from an existing RDF file. Used to
     * add statements to an existing file. Failures are reported on the error
     * stream via a stack trace and otherwise ignored.
     * @param file the existing file name
     */
    public void loadRDF(String file)
    {
        Reader reader = null;
        try
        {
            reader = new FileReader(file);
            _model.read(reader, "");
        } catch (Exception e)
        {
            e.printStackTrace();
        } finally
        {
            closeQuietly(reader);
        }
    }

    /**
     * Sets the Writer that the MORE instance will use to serialize the RDF data
     * @param writer a writer instance
     */
    public void setWriter(Writer writer)
    {
        _writer = writer;
    }

    /**
     * Causes the instance to iterate through its list of files. These will be parsed in
     * turn by the POI API, with each document becoming a separate RDF resource
     * identified by the absolute path of the file.
     * As properties are extracted from the document, the mapping schema is used to
     * determine the correct RDF property to add to the document resource.
     * A failure on one file is reported via a stack trace and does not stop the
     * remaining files from being processed.
     */
    public void process()
    {
        for (Iterator iter = _files.iterator(); iter.hasNext();)
        {
            File file = new File( (String)iter.next() );
            InputStream in = null;
            try
            {
                // The callback adds statements to _document, so it has to be set
                // before the reader starts delivering events.
                _document = _model.createResource(file.getAbsolutePath());
                POIFSReader reader = new POIFSReader();
                reader.registerListener( this );
                in = new FileInputStream(file);
                reader.read(in);
            } catch (Exception e)
            {
                e.printStackTrace();
            } finally
            {
                // Release the file handle whether or not parsing succeeded.
                closeQuietly(in);
            }
        }
    }

    /**
     * Load the RDF mapping schema into a Map of property names -> RDF Properties.
     * Every subject in the schema that carries a
     * <code>http://www.ldodds.com/ns/more#pidString</code> property is registered
     * under the literal value of that property. If the schema cannot be read the
     * problem is reported via a stack trace and whatever was collected so far
     * (possibly nothing) is returned.
     *
     * @param config the path to the configuration file
     * @return a Map of RDF properties, keyed under their HPSF property name; never null
     */
    private Map getPIDDictionary(String config)
    {
        HashMap dict = new HashMap();
        Reader reader = null;
        try
        {
            reader = new FileReader(config);
            Model model = new ModelMem();
            model.read(reader, "");
            Property pidStringProperty = model.createProperty("http://www.ldodds.com/ns/more#", "pidString");
            ResIterator iter = model.listSubjectsWithProperty(pidStringProperty);
            while (iter.hasNext())
            {
                Resource resource = iter.next();
                Statement pidStatement = resource.getProperty(pidStringProperty);
                dict.put(pidStatement.getLiteral().getString(),
                             model.createProperty(resource.getNameSpace(), resource.getLocalName()));
            }
        } catch (Exception e)
        {
            e.printStackTrace();
        } finally
        {
            closeQuietly(reader);
        }
        return dict;
    }

    /**
     * Dump the generated RDF data to the Writer provided in the call to
     * setWriter (or System.out as a default). The writer is flushed but left
     * open, because it may have been supplied by the caller.
     */
    public void dumpRDF()
    {
        try
        {
            _model.write(_writer);
            // Nothing in this class closes the writer, so flush explicitly to make
            // sure buffered output (e.g. of a FileWriter) reaches its destination.
            _writer.flush();
        } catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Command-line entry point: configures an instance from the arguments,
     * processes the listed files and writes out the resulting RDF.
     *
     * @param args the command-line arguments, see {@link #usage()}
     */
    public static void main(String[] args)
    {
        try
        {
            MORE more = new MORE();
            configure(more, args);
            more.process();
            more.dumpRDF();
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Process the command-line arguments and configure the provided MORE instance
     * according to the switches and file names provided.
     *
     * <p>
     * Exits the JVM with status 1 if no arguments are given or a switch is missing
     * its value, and with status 0 after printing the usage message for
     * <code>-h</code>/<code>-help</code>. Unknown switches are reported on the
     * error stream and skipped; every other argument is queued as a file to process.
     * </p>
     *
     * @param more the instance to configure
     * @param args the command-line arguments.
     */
    private static void configure(MORE more, String[] args)
    {
        if (args.length == 0)
        {
            more.usage();
            System.exit(1);
        }

        for (int i=0; i<args.length; i++)
        {
            String arg = args[i];
            if (arg.equals("-h") || arg.equals("-help"))
            {
                more.usage();
                System.exit(0);
            }
            if (arg.equals("-a") || arg.equals("-annotate"))
            {
                more.loadRDF(optionValue(more, args, ++i, arg));
                continue;
            }
            if (arg.equals("-m") || arg.equals("-mapping"))
            {
                more.loadConfig(optionValue(more, args, ++i, arg));
                continue;
            }
            if (arg.equals("-f") || arg.equals("-file"))
            {
                String outputFile = optionValue(more, args, ++i, arg);
                try
                {
                    more.setWriter( new FileWriter(outputFile));
                } catch (Exception e)
                {
                    // Best effort: keep the previously configured writer.
                    e.printStackTrace();
                }
                continue;
            }
            if (arg.startsWith("-"))
            {
                // Report on the error stream (standard output is the default RDF
                // destination) and really ignore the switch rather than treating
                // it as a file name.
                System.err.println("Ignoring unknown switch: " +arg);
                continue;
            }
            more.addFileToProcess(arg);
        }
    }

    /**
     * Returns the value following a command-line switch. If the switch was the
     * last argument, reports the problem, prints the usage message and exits
     * the JVM with status 1.
     *
     * @param more the instance used to print the usage message
     * @param args the command-line arguments
     * @param index the position at which the value is expected
     * @param option the switch the value belongs to, used in the error message
     * @return the argument at <code>index</code>
     */
    private static String optionValue(MORE more, String[] args, int index, String option)
    {
        if (index >= args.length)
        {
            System.err.println("Missing value for switch: " + option);
            more.usage();
            System.exit(1);
        }
        return args[index];
    }

    /**
     * Dump usage message to console
     */
    private void usage()
    {
        String[] lines = {
            "Microsoft Office RDF Extractor",
            "",
            "Usage: java com.ldodds.more.MORE [options] file(s)",
            "",
            "[options]",
            " -m path to pid mapping schema",
            " -f output file",
            " -a annotate existing file"
        };

        for (int i=0; i<lines.length;i++)
        {
            System.out.println(lines[i]);
        }
    }

    /**
     * Call-back method registered with the POI API. This does the actual processing.
     *
     * Firstly attempts to get the PropertySet from the stream in the callback. If this
     * isn't a POI property stream then the event is ignored.
     *
     * Otherwise we get all sections in the stream and iterate through all of their properties.
     * Each property is checked against the internal map created by <code>getPIDDictionary</code>.
     * If the property is present in the map and has a non-empty value then a new RDF
     * statement is created for the current document resource. Everything is assumed to be
     * an RDF Literal unless it starts with "http" or "mailto", in which case it is assumed
     * to be a Resource. A failure to add one statement is reported via a stack trace and
     * does not stop the remaining properties from being processed.
     *
     * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(POIFSReaderEvent)
     */
    public void processPOIFSReaderEvent(POIFSReaderEvent event)
    {
        PropertySet propertySet = getPropertySet(event);
        if (propertySet == null)
        {
            return;
        }

        List sections = propertySet.getSections();
        for (Iterator iter = sections.iterator(); iter.hasNext();)
        {
            Section section = (Section) iter.next();
            org.apache.poi.hpsf.Property[] properties = section.getProperties();

            for (int i = 0; i < properties.length; i++)
            {
                String pid = section.getPIDString( properties[i].getID() );
                Property rdfProperty = (Property)_pid2Property.get( pid );
                if (rdfProperty == null)
                {
                    // Not mentioned in the mapping schema: nothing to emit.
                    continue;
                }

                // Only look at the value of mapped properties, and tolerate a
                // property that carries no value at all.
                Object rawValue = properties[i].getValue();
                if (rawValue == null)
                {
                    continue;
                }
                String value = rawValue.toString();
                if (value.length() == 0)
                {
                    continue;
                }

                try
                {
                    RDFNode node = getNode(value);
                    _model.add(_document, rdfProperty, node );
                } catch (Exception e)
                {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Create the correct type of node -- Resource, Literal -- for a given property value.
     * Values starting with "http" or "mailto" become Resources, everything else a Literal.
     * @param value the property value
     * @return a Resource or Literal instance
     * @throws Exception if Jena throws an exception
     */
    public RDFNode getNode(String value) throws Exception
    {
        if (value.startsWith("http") || value.startsWith("mailto"))
        {
            return _model.createResource(value);
        }
        return _model.createLiteral(value);
    }

    /**
     * Creates a property set if possible.
     *
     * @param event the callback event
     * @return PropertySet the property set initialised from the stream or null if the
     * stream doesn't contain properties.
     * @throws RuntimeException if I/O or property set problems encountered; the
     * original exception is attached as the cause
     */
    private PropertySet getPropertySet(POIFSReaderEvent event)
    {
        try
        {
            return PropertySetFactory.create(event.getStream());
        }
        catch (NoPropertySetStreamException ex)
        {
            //not a property set stream, or malformed. Stop here
            return null;
        }
        catch (Exception ex)
        {
            // Keep the original exception as the cause so its stack trace is not lost.
            throw new RuntimeException("Property set stream: "
                    + event.getPath()
                    + event.getName()
                    + ": "
                    + ex, ex);
        }
    }

    /**
     * Closes a reader, ignoring a null argument and any failure to close.
     *
     * @param reader the reader to close, may be null
     */
    private static void closeQuietly(Reader reader)
    {
        if (reader == null)
        {
            return;
        }
        try
        {
            reader.close();
        } catch (IOException ignored)
        {
            // Nothing useful can be done if closing fails; the data was already read.
        }
    }

    /**
     * Closes an input stream, ignoring a null argument and any failure to close.
     *
     * @param in the stream to close, may be null
     */
    private static void closeQuietly(InputStream in)
    {
        if (in == null)
        {
            return;
        }
        try
        {
            in.close();
        } catch (IOException ignored)
        {
            // Nothing useful can be done if closing fails; the data was already read.
        }
    }

}
