#include "cl_config.h"
#include "defaults.h"
#include "message.h"
#include "phrase.h"
#include "strdup.h"
#include "wchar_utils.h"

#include <libxml/tree.h>
#include <libxml/xpath.h>
#include <libxslt/xsltInternals.h>
#include <libxslt/transform.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>
#include <popt.h>
#include <string.h>
#ifdef TRISH2
#  include <patricia_search.h>
#  include <trish_dict.h>
#else
#  include <dictionary.h>
#  include <patricia_node.h>
#endif

#include "pedantic_macros.h"
#include "mouchard.h"

/* Message thresholds. Both have external linkage; presumably they are read by
 * the message module -- TODO confirm against message.h. */
int verbosity = DEFAULT_VERBOSITY;
int severity = DEFAULT_SEVERITY;

/* Command-line settings. The cl_options table points popt at these variables,
 * so they hold the defaults until an option overrides them. */

/* Passed to phrase_extension() each time a candidate phrase grows by a token. */
static char *separator = DEFAULT_SEPARATOR;
/* Flag stored by the lemma option (POPT_ARG_NONE); off by default. */
static int lemma = 0;
/* Largest number of consecutive tokens tried as one candidate phrase. */
static int max_length = DEFAULT_MAXLENGTH;
/* Forwarded as the last argument of psearch_new(); its exact meaning is
 * defined by the search library, not by this file. */
static int density = DEFAULT_DENSITY;
/* Appended to the input file name, stripped of its extension, to build the
 * output file name. */
static char *suffix = DEFAULT_SUFFIX;

/*
 * popt option table given to poptGetContext() in main().
 *
 * - POPT_AUTOHELP adds popt's automatic help options.
 * - The POPT_ARG_INCLUDE_TABLE entry pulls in verb_opts, which is declared
 *   outside this file (presumably the verbosity options handled by
 *   SWITCH_VERBOSITY in main() -- TODO confirm).
 * - Each remaining entry makes popt store the parsed value directly into the
 *   static variable it points at; the long/short names and help texts come
 *   from the CLO_* macros.
 */
struct poptOption cl_options[] = {
  POPT_AUTOHELP
  { NULL , '\0' , POPT_ARG_INCLUDE_TABLE     , verb_opts , 0 , "Verbosity options:" , NULL },
  { CLO_LONG_SEPARATOR , CLO_SHORT_SEPARATOR , POPT_ARG_STRING , &separator  , 0               , CLO_HELP_SEPARATOR , "SEP" },
  { CLO_LONG_MAXLENGTH , CLO_SHORT_MAXLENGTH , POPT_ARG_INT    , &max_length , 0               , CLO_HELP_MAXLENGTH , "LENGTH" },
  { CLO_LONG_DENSITY   , CLO_SHORT_DENSITY   , POPT_ARG_INT    , &density    , 0               , CLO_HELP_DENSITY   , "DENSITY" },
  { CLO_LONG_LEMMA     , CLO_SHORT_LEMMA     , POPT_ARG_NONE   , &lemma      , 0               , CLO_HELP_LEMMA     , NULL },
  { CLO_LONG_SUFFIX    , CLO_SHORT_SUFFIX    , POPT_ARG_STRING , &suffix     , 0               , CLO_HELP_SUFFIX    , "SUFFIX" },
  POPT_TABLEEND
};

/*
 * Record one recognised term in the document.
 *
 * Appends <semantic_unit><term>...</term></semantic_unit> to su_level. The
 * term holds, in this order: <canonical_form> (only when ph->label is set),
 * <form>, <id>, and a reference to what the term covers:
 *   - exactly one token: a <refid_word> holding ph->tokens[0];
 *   - otherwise: a <refid_phrase> naming a new <phrase> (an <id> followed by a
 *     <list_refid_components> with one <refid_word> per token) that is
 *     appended to phrase_level.
 *
 * doc             document that owns the new nodes
 * su_level        parent of the new <semantic_unit>
 * phrase_level    parent of the new <phrase>, when one is needed
 * next_term_id    counter used for the term id; always incremented
 * next_phrase_id  counter used for the phrase id; incremented only when a
 *                 <phrase> is created
 * ph              the matched phrase; only read, still owned by the caller
 */
void add_term(xmlDoc *doc, xmlNode *su_level, xmlNode *phrase_level, int *next_term_id, int *next_phrase_id, phrase_t *ph) {
  /* Prefix (sizeof counts its NUL) plus the longest "%d" rendering of an int:
   * at most 3 characters per byte, plus one for the sign. */
  char id[sizeof(DEFAULT_TERM_ID_PREFIX) + 3 * sizeof(int) + 1];
  xmlNode *su = xmlNewDocNode(doc, NULL, BAD_CAST "semantic_unit", NULL);
  xmlNode *term = xmlNewDocNode(doc, NULL, BAD_CAST "term", NULL);
  /* Forms and labels are free text: the Raw variant stores them verbatim,
   * whereas xmlNewDocNode() would try to parse '&' as an entity reference. */
  xmlNode *cform = ph->label ? xmlNewDocRawNode(doc, NULL, BAD_CAST "canonical_form", BAD_CAST ph->label) : NULL;
  xmlNode *form = xmlNewDocRawNode(doc, NULL, BAD_CAST "form", BAD_CAST ph->form);
  xmlNode *id_node;

  snprintf(id, sizeof id, DEFAULT_TERM_ID_PREFIX "%d", *next_term_id);
  *next_term_id = *next_term_id + 1;
  id_node = xmlNewDocRawNode(doc, NULL, BAD_CAST "id", BAD_CAST id);

  xmlAddChild(su_level, su);
  xmlAddChild(su, term);
  if (cform != NULL) {
    xmlAddChild(term, cform);
  }
  xmlAddChild(term, form);
  xmlAddChild(term, id_node);

  if (ph->n_tokens == 1) {
    /* A single-token term refers straight to its word. */
    xmlNode *refid_word = xmlNewDocRawNode(doc, NULL, BAD_CAST "refid_word", BAD_CAST ph->tokens[0]);
    xmlAddChild(term, refid_word);
  }
  else {
    /* A multi-token term refers to a phrase that lists its words. */
    char ph_id[sizeof(DEFAULT_PHRASE_ID_PREFIX) + 3 * sizeof(int) + 1];
    xmlNode *phrase = xmlNewDocNode(doc, NULL, BAD_CAST "phrase", NULL);
    xmlNode *list_refid_components = xmlNewDocNode(doc, NULL, BAD_CAST "list_refid_components", NULL);
    xmlNode *ph_id_node;
    xmlNode *refid_phrase;
    int i;

    snprintf(ph_id, sizeof ph_id, DEFAULT_PHRASE_ID_PREFIX "%d", *next_phrase_id);
    *next_phrase_id = *next_phrase_id + 1;
    ph_id_node = xmlNewDocRawNode(doc, NULL, BAD_CAST "id", BAD_CAST ph_id);

    for (i = 0; i < ph->n_tokens; i++) {
      xmlNode *refid_word = xmlNewDocRawNode(doc, NULL, BAD_CAST "refid_word", BAD_CAST ph->tokens[i]);
      xmlAddChild(list_refid_components, refid_word);
    }

    xmlAddChild(phrase, ph_id_node);
    xmlAddChild(phrase, list_refid_components);
    xmlAddChild(phrase_level, phrase);

    refid_phrase = xmlNewDocRawNode(doc, NULL, BAD_CAST "refid_phrase", BAD_CAST ph_id);
    xmlAddChild(term, refid_phrase);
  }
}






int main(int argc, const char **argv) {
  int opth;
  int i, j;
  FILE *f;
  const char *dict_fn;
  tdict_t *dict;
  poptContext context = poptGetContext(NULL, argc, argv, cl_options, 0);

  poptSetOtherOptionHelp(context, "DICT FILE [FILES]");

  while ((opth = poptGetNextOpt(context)) > 0) {
    switch (opth) {
    SWITCH_VERBOSITY
    }
  }
  if (opth != -1) {
    message(VERBOSITY_QUIET | SEVERITY_FATAL, "%s %s", poptStrerror(opth), poptBadOption(context, 0));
  }

  if (poptPeekArg(context)) {
    dict_fn =  poptGetArg(context);
    if ((f = fopen(dict_fn, "r"))) {
      dict = tdict_load(f);
      fclose(f);
    }
    else {
      message(VERBOSITY_QUIET | SEVERITY_FATAL, "Could not load dictionary '%s'", dict_fn);
    }
  }
  else {
    message(VERBOSITY_QUIET | SEVERITY_FATAL, "You must provide a dictionary file name");
  }

  if (poptPeekArg(context)) {
    const char *alvis_fn;
    while ((alvis_fn = poptGetArg(context))) {
      char *in_dot = NULL;
      char *out_fn = NULL;
      xmlDoc *alvis_doc = xmlReadFile(alvis_fn, NULL, 0);
      xmlDoc *in_doc = NULL;
      xmlNode *la = NULL;
      xmlNode *su_level = NULL;
      xmlNode *phrase_level = NULL;

      int old_sub = xmlSubstituteEntitiesDefault(1);
      int old_led = xmlLoadExtDtdDefaultValue;
      xsltStylesheet *ss = xsltParseStylesheetFile(ALVIS_TO_BIOTERMTAGGER_XSLT);

      xmlXPathContext *alvis_xpc;
      xmlXPathObject *alvis_xpo;


      message(VERBOSITY_NORMAL | SEVERITY_INFO, "Opening file '%s'", alvis_fn);

      if (alvis_doc == NULL) {
	message(VERBOSITY_QUIET | SEVERITY_ERROR, "Could not open file '%s'", alvis_fn);
      }
      
      
      if (ss == NULL) {
	message(VERBOSITY_QUIET | SEVERITY_FATAL, "Could not open file '" ALVIS_TO_BIOTERMTAGGER_XSLT "'");
      }


      /* Look for linguistic analysis and semantic_unit_level */
      alvis_xpc = xmlXPathNewContext(alvis_doc);
      xmlXPathRegisterNs(alvis_xpc, BAD_CAST "a", BAD_CAST ALVIS_NS);
      alvis_xpo = xmlXPathEvalExpression(BAD_CAST "/a:documentCollection/a:documentRecord/a:linguisticAnalysis", alvis_xpc);
      if (!(alvis_xpo && alvis_xpo->nodesetval)) {
	message(VERBOSITY_NORMAL | SEVERITY_WARNING, "No linguistic analysis in '%s'", alvis_fn);
	xmlXPathFreeObject(alvis_xpo);
	xmlXPathFreeContext(alvis_xpc);
	xmlFreeDoc(alvis_doc);
	xmlCleanupParser();
	break;
      }
      la = alvis_xpo->nodesetval->nodeTab[0];
      xmlXPathFreeObject(alvis_xpo);

      alvis_xpo = xmlXPathEvalExpression(BAD_CAST "/a:documentCollection/a:documentRecord/a:linguisticAnalysis/a:semantic_unit_level", alvis_xpc);
      if (alvis_xpo && alvis_xpo->nodesetval && alvis_xpo->nodesetval->nodeNr) {
	su_level = alvis_xpo->nodesetval->nodeTab[0];
	message(VERBOSITY_NORMAL | SEVERITY_INFO, "  Found semantic_unit_level");
      }
      else {
	su_level = xmlNewDocNode(alvis_doc, NULL, BAD_CAST "semantic_unit_level", NULL);
	xmlAddChild(la, su_level);
	message(VERBOSITY_NORMAL | SEVERITY_INFO, "  Added semantic_unit_level");
      }
      xmlXPathFreeObject(alvis_xpo);

      alvis_xpo = xmlXPathEvalExpression(BAD_CAST "/a:documentCollection/a:documentRecord/a:linguisticAnalysis/a:phrase_level", alvis_xpc);
      if (alvis_xpo && alvis_xpo->nodesetval && alvis_xpo->nodesetval->nodeNr) {
	phrase_level = alvis_xpo->nodesetval->nodeTab[0];
	message(VERBOSITY_NORMAL | SEVERITY_INFO, "  Found phrase_level");
      }
      else {
	phrase_level = xmlNewDocNode(alvis_doc, NULL, BAD_CAST "phrase_level", NULL);
	xmlAddPrevSibling(su_level, phrase_level);
	message(VERBOSITY_NORMAL | SEVERITY_INFO, "  Added phrase_level");
      }
      xmlXPathFreeObject(alvis_xpo);
      xmlXPathFreeContext(alvis_xpc);


      /* Apply transform */
      xmlLoadExtDtdDefaultValue = 1;
      in_doc = xsltApplyStylesheet(ss, alvis_doc, NULL);
      xsltFreeStylesheet(ss);
      xsltCleanupGlobals();
      xmlSubstituteEntitiesDefault(old_sub);
      xmlLoadExtDtdDefaultValue = old_led;
      message(VERBOSITY_NORMAL | SEVERITY_INFO, "  Transformation applied");


      /* Search terms */
      if (in_doc != NULL) {
	psearch_t *search = psearch_new(dict->map, PSEARCH_OPTION_FULL | PSEARCH_OPTION_CI_ALL, NULL, density);
	xmlXPathContext *in_xpc = xmlXPathNewContext(in_doc);
	xmlXPathObject *in_xpo = xmlXPathEvalExpression("/BioTermTagger/token-list/t", in_xpc);
	phrase_t *last_ph = NULL;
	phrase_t *current_ph = NULL;
	int last_i = 0;
	int last_j = 0;
	int next_term_id = 0;
	int next_phrase_id = 0;

	for (i = 0; i < in_xpo->nodesetval->nodeNr; i++) {
	  for (j = 1; (j <= max_length) && (i + j - 1 < in_xpo->nodesetval->nodeNr); j++) {
	    int k = i + j - 1;
	    char *w = (char *)in_xpo->nodesetval->nodeTab[k]->children->content;
	    char *id = (char *)xmlHasProp(in_xpo->nodesetval->nodeTab[k], "id")->children->content;
	    char *lemma = (char *)xmlHasProp(in_xpo->nodesetval->nodeTab[k], "lemma")->children->content;

	    if (j == 1) {
	      phrase_free(current_ph);
	      current_ph = phrase_new(w, lemma, id);
	    }
	    else {
	      phrase_extension(current_ph, w, lemma, separator, id);
	    }
	    
	    if (last_ph && (i >= last_i + last_j)) {
	      message(VERBOSITY_LOCACE | SEVERITY_INFO, "Getting out of the last term range, registering '%s'", last_ph->form);
	      add_term(alvis_doc, su_level, phrase_level, &next_term_id, &next_phrase_id, last_ph);
	      phrase_free(last_ph);
	      last_ph = NULL;
	    }
	    
	    if ((i >= last_i + last_j) || (j > last_j)) {
	      wchar_t *w = char2wcs(lemma ? current_ph->lemma : current_ph->form);
	      psearch_reinit(search);
	      psearch_new_query(search, w);
	      patricia_search(search);
	      message(VERBOSITY_VERBOSE | SEVERITY_INFO, "Searching for '%s': %s", lemma ? current_ph->lemma : current_ph->form, search->final ? "yes" : "no");
	      if (search->final) {
		phrase_free(last_ph);
		last_ph = phrase_copy(current_ph);
		last_ph->label = wcs2char(((wchar_t **)search->final->node->data)[0]);
		last_i = i;
		last_j = j;
	      }
	      free(w);
	    }
	  }
	}
	phrase_free(current_ph);
	phrase_free(last_ph);
	message(VERBOSITY_NORMAL | SEVERITY_INFO, "  Found %d terms, %d phrases", next_term_id, next_phrase_id);


	/* Dump file */
	if ((in_dot = strrchr(alvis_fn, '.'))) {
	  *in_dot = '\0';
	}
	out_fn = calloc(strlen(suffix) + strlen(alvis_fn) + 1, sizeof(char));
	strcat(out_fn, alvis_fn);
	strcat(out_fn, suffix);
	f = fopen(out_fn, "w");
	if (f == NULL) {
	  message(VERBOSITY_QUIET | SEVERITY_WARNING, "Could not open '%s' for writing", out_fn);
	}
	else {
	  xmlDocDump(f, alvis_doc);
	  fclose(f);
	}
	message(VERBOSITY_NORMAL | SEVERITY_INFO, "  File '%s' written", out_fn);
	free(out_fn);


	psearch_free(search);
	xmlXPathFreeObject(in_xpo);
	xmlXPathFreeContext(in_xpc);
	xmlFreeDoc(in_doc);
	xmlFreeDoc(alvis_doc);
	xmlCleanupParser();
      }
    }
  }
  else {
    message(VERBOSITY_QUIET | SEVERITY_FATAL, "You must provide a file name");
  }

  poptFreeContext(context);

  tdict_free(dict);

  return 0;
}
