<?xml version="1.0" encoding="UTF-8"?>
<!--
  * dih-ojs.xml
  *
  * Copyright (c) 2014-2023 Simon Fraser University
  * Copyright (c) 2003-2023 John Willinsky
  * Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
  *
  * OJS solr/lucene search plugin DataImportHandler default configuration.
  * 
  * NB: Independently of your deployment mode (embedded or central) you should
  * not have to change this configuration file. Be aware that changes may break
  * the OJS/solr communication protocol.
  -->
<dataConfig>
  <!-- Source of the article list XML; read by the "article" entity. -->
  <dataSource name="http-post" type="ContentStreamDataSource" />
  <!-- Reads its input from a column of the parent row (see the dataField
       attributes); used by the "galley" and "articlePostProcessing" entities. -->
  <dataSource name="field" type="FieldReaderDataSource" />
  <!-- Reads the galley file addressed by its URL; used by the "galley_file" entity. -->
  <dataSource name="file" type="BinURLDataSource" />

  <script><![CDATA[
    /**
     * Append text to a document-level field.
     *
     * Some fields collect content from various entities. These
     * fields have to be stored in document rather than entity
     * context: the session attribute 'docFields' (scope 'document')
     * lists the names of all fields collected so far and every
     * field's text is kept in a session attribute named after the
     * field, with the appended chunks separated by a single space.
     *
     * Missing (null or undefined), empty and whitespace-only text
     * is ignored.
     *
     * NB: On engines without a native Array.prototype.indexOf() this
     * function relies on the replacement installed by transformArticle().
     *
     * @param Context context
     * @param String fieldName
     * @param String textToAppend
     */
    function appendToDocField(context, fieldName, textToAppend) {
      // Loose comparison on purpose: undefined must be ignored as
      // well, otherwise the trim call below would throw.
      if (textToAppend == null) return;

      // Coerce to a JS string so that trimming and the strict
      // comparison with '' do not depend on the kind of string
      // object that was handed in.
      textToAppend = String(textToAppend);
      if (String.prototype.trim) {
        // JS 1.8.1+
        textToAppend = textToAppend.trim();
      } else {
        // JS prior to 1.8.1
        textToAppend = textToAppend.replace(/^\s\s*/, '').replace(/\s\s*$/, '');
      }
      if (textToAppend === '') return;

      var fieldValue;
      var docFields = context.getSessionAttribute('docFields', 'document');
      if (docFields == null) {
        docFields = [];
      }
      if (docFields.indexOf(fieldName) === -1) {
        // First chunk for this field: register the field name.
        docFields.push(fieldName);
        context.setSessionAttribute('docFields', docFields, 'document');

        fieldValue = textToAppend;
      } else {
        // Field already known: append to what has been collected.
        fieldValue = context.getSessionAttribute(fieldName, 'document');
        fieldValue += ' ' + textToAppend;
      }
      context.setSessionAttribute(fieldName, fieldValue, 'document');
    }

    /**
     * Custom transformer for the article entity.
     *
     * Prepares the galley XML column, handles deletion requests,
     * makes the article and journal IDs unique across installations
     * and expands the localized value lists, the author list and the
     * publication date into the fields that are actually indexed.
     *
     * @param Map row
     * @param Context context
     * @returns Map The transformed row.
     */
    function transformArticle(row, context) {
      // Hack: JS engines prior to version 1.6 do not implement
      // Array.prototype.indexOf(). Install a replacement before
      // the first array is instantiated.
      if (!Array.prototype.indexOf) {
        Array.prototype.indexOf = function(needle) {
          var pos = 0;
          while (pos < this.length) {
            if (this[pos] === needle) {
              return pos;
            }
            pos++;
          }
          return -1;
        };
      }

      // The galley XML must be prepared before anything else.
      // Otherwise the entity processor reading this column fails
      // later on when the column does not exist.
      var galleyXml = row.get('etl_galley_xml');
      if (galleyXml == null) {
        galleyXml = '<?xml version="1.0" encoding="utf-8"?><galleyList />';
      }
      row.put('etl_galley_xml', String(galleyXml));

      // In pull-indexing an article may be flagged for deletion
      // rather than for update. In that case only the deletion
      // markers are set and all further processing is bypassed.
      var installationId = row.get('inst_id');
      var rawArticleId = row.get('submission_id');
      var requestedAction = String(row.get('loadAction'));
      var uniqueArticleId = installationId + '-' + rawArticleId;
      if (requestedAction === 'delete') {
        row.put('$deleteDocById', uniqueArticleId);
        row.put('$skipDoc', true);
        return row;
      }

      // Prefix IDs with the installation ID so that they are
      // unique across installations.
      var rawJournalId = row.get('journal_id');
      row.put('submission_id', uniqueArticleId);
      row.put('journal_id', installationId + '-' + rawJournalId);

      // All localized fields ...
      var localizedFields = ['title', 'abstract', 'discipline', 'subject', 'type', 'coverage', 'journalTitle'];

      // ... and the ways in which each of them is indexed.
      var searchable = ['title', 'abstract', 'discipline', 'subject', 'type', 'coverage'];
      var indexTerms = ['discipline', 'subject', 'type', 'coverage'];
      var sortable = ['title', 'journalTitle'];
      var facetable = ['discipline', 'subject', 'type', 'coverage', 'journalTitle'];

      var termsSuggest = '';
      for (var f = 0; f < localizedFields.length; f++) {
        var field = localizedFields[f];
        var listColumn = 'etl_' + field + 'List';

        // Values: skip the field entirely when there are none.
        var values = row.get(listColumn);
        if (values == null) continue;
        row.remove(listColumn);

        // Locales, one per value.
        var locales = row.get(listColumn + '_locales');
        row.remove(listColumn + '_locales');

        // Optional flags marking values that are for sorting only.
        var sortOnlyFlags = null;
        if (row.containsKey(listColumn + '_sortOnly')) {
          sortOnlyFlags = row.get(listColumn + '_sortOnly');
          row.remove(listColumn + '_sortOnly');
        }

        // How this field is indexed does not depend on the value.
        var isSearchField = searchable.indexOf(field) !== -1;
        var isIndexTermsField = indexTerms.indexOf(field) !== -1;
        var isSortField = sortable.indexOf(field) !== -1;
        var isFacetField = facetable.indexOf(field) !== -1;

        var fieldSuggest = '';
        var valueCount = values.size();
        for (var v = 0; v < valueCount; v++) {
          var locale = String(locales.get(v));
          var value = String(values.get(v));

          // Without flags every value is indexed for searching.
          var notSortOnly = (sortOnlyFlags === null) || String(sortOnlyFlags.get(v)) === 'false';

          if (isSearchField && notSortOnly) {
            // Search field, concatenated spelling field and
            // per-field auto-suggest text.
            row.put(field + '_' + locale, value);
            appendToDocField(context, 'default_spell', value);
            fieldSuggest += ' ' + value;
          }

          if (isIndexTermsField && notSortOnly) {
            // Auto-suggest text shared by all index term fields.
            termsSuggest += ' ' + value;
          }

          if (isSortField) {
            row.put(field + '_' + locale + '_txtsort', value);
          }

          if (isFacetField) {
            row.put(field + '_' + locale + '_facet', value);
          }
        }
        row.put(field + '_spell', fieldSuggest);
      }
      row.put('indexTerms_spell', termsSuggest);

      // Authors are concatenated into a single string. A multi-valued
      // field could be used instead but a single string keeps the
      // schema simple (one less field type necessary).
      var authorList = row.get('etl_authorList');
      row.remove('etl_authorList');

      var joinedAuthors = '';
      if (authorList != null) {
        var authorCount = authorList.size();
        for (var a = 0; a < authorCount; a++) {
          // A separator is only needed once something was collected.
          var separator = (joinedAuthors === '') ? '' : '; ';
          joinedAuthors += separator + String(authorList.get(a));
        }
      }
      row.put('authors_txt', joinedAuthors);
      row.put('authors_txtsort', joinedAuthors);
      appendToDocField(context, 'default_spell', joinedAuthors);
      row.put('authors_spell', joinedAuthors);
      row.put('authors_facet', joinedAuthors);

      // Publication date sort field. A copy field instruction in the
      // schema file would do as well but this keeps the code close to
      // all other import transformations.
      var pubDate = row.get('publicationDate_dt');
      if (pubDate !== null) {
        row.put('publicationDate_dtsort', pubDate);
      }

      return row;
    }

    /**
     * Custom transformer for full text.
     *
     * Moves the extracted 'text' column of a file row into
     * document-level fields (see appendToDocField()): the
     * format and locale specific full text field, the default
     * spelling field and the full text auto-suggest field. The
     * 'text' column is removed from the row afterwards.
     *
     * Rows without extracted text do not contribute anything.
     *
     * @param Map row
     * @param Context context
     * @returns Map The transformed row.
     */
    function transformFullText(row, context) {
      var entityName = context.getParentContext().getResolvedEntityAttribute('name');
      var locale = context.resolve(entityName + '.etl_file_locale');

      // Analyze unknown locales with a generic analyzer.
      if (locale == 'unknown') locale = 'txt';

      // Construct the dynamic field names. They remain null for a
      // parent entity other than 'galley' so that no field with an
      // undefined name is ever created.
      var fieldName = null;
      var autoSuggestFieldName = null;
      if (entityName == 'galley') {
        // Get the document format.
        var docType = resolveMimetype(String(row.get('Content-Type')));
        fieldName = 'galleyFullText_' + docType + '_' + locale;
        autoSuggestFieldName = 'galleyFullText_spell';
      }

      // Only index text that is actually there: String(null) would
      // otherwise add the literal word "null" to the index.
      var extractedText = row.get('text');
      if (extractedText != null) {
        var currentText = String(extractedText);

        // Add full text.
        if (fieldName !== null) {
          appendToDocField(context, fieldName, currentText);
        }

        // Add full text to spellchecker and auto-suggest dictionaries.
        appendToDocField(context, 'default_spell', currentText);
        if (autoSuggestFieldName !== null) {
          appendToDocField(context, autoSuggestFieldName, currentText);
        }
      }

      row.remove('text');
      return row;
    }

    /**
     * Resolve the mime type to the internal
     * document type.
     *
     * Mime type options (everything from the first ';') are
     * ignored. The comparison is case-insensitive and tolerates
     * surrounding whitespace. Anything that is not recognized,
     * including a missing mime type, resolves to 'other'.
     *
     * @param String mimetype
     * @returns String The internal document type, one of 'pdf',
     *  'html', 'doc', 'plain', 'epub', 'rtf', 'xml' or 'other'.
     */
    function resolveMimetype(mimetype) {
      // Remove mimetype options.
      mimetype = String(mimetype).replace(/;.*$/, '');

      // Media type names are case-insensitive and may be padded
      // with whitespace (e.g. 'Text/HTML ; charset=UTF-8').
      mimetype = mimetype.replace(/^\s+/, '').replace(/\s+$/, '').toLowerCase();

      // Determine the internal mime type.
      switch (mimetype) {
        case 'application/pdf':
        case 'application/x-pdf':
          return 'pdf';

        case 'text/html':
        case 'application/xhtml+xml':
          return 'html';

        case 'application/msword':
        case 'application/vnd.oasis.opendocument.text':
        case 'application/vnd.ms-word.document.macroenabled.12':
        case 'application/vnd.ms-word.template.macroenabled.12':
        case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        case 'application/vnd.openxmlformats-officedocument.wordprocessingml.template':
        case 'application/vnd.stardivision.writer':
        case 'application/x-staroffice-template':
        case 'application/vnd.sun.xml.writer':
        case 'application/x-vnd.sun.xml.writer':
        case 'application/vnd.sun.xml.writer.template':
          return 'doc';

        case 'text/plain':
          return 'plain';

        case 'application/epub+zip':
          return 'epub';

        case 'application/rtf':
        case 'text/rtf':
          return 'rtf';

        case 'application/xml':
        case 'text/xml':
          return 'xml';

        default:
          return 'other';
      }
    }

    /**
     * This function is a custom transformer that will
     * be called after all other transformers. We use it
     * to store document-level fields to the document.
     *
     * The names of the document-level fields are read from
     * the session attribute 'docFields' (scope 'document'),
     * their values from session attributes named after the
     * fields. When no document-level field was collected the
     * row is returned unchanged.
     *
     * NB: This cannot be done in the other transformers
     * as only the first row.put() will have effect.
     *
     * @param Map row
     * @param Context context
     * @returns Map The transformed row.
     */
    function postProcessArticle(row, context) {
      var docFieldName, docFieldValue;
      var docFields = context.getSessionAttribute('docFields', 'document');

      // The attribute only exists once some text has been appended
      // to a document-level field. Nothing to do otherwise.
      if (docFields == null) return row;

      // Add document-level fields to the document.
      for (var iD=0, lenD=docFields.length; iD<lenD; iD++) {
        docFieldName = docFields[iD];
        docFieldValue = context.getSessionAttribute(docFieldName, 'document');
        row.put(docFieldName, docFieldValue);
      }
      return row;
    }
  ]]></script>

  <!--
    Import pipeline. The "article" entity iterates over the article elements
    of the XML read from the "http-post" data source. Its nested entities
    extract the galley full text and finally put the collected document-level
    fields onto the row.
  -->
  <document>
    <!-- Every article row passes through HTMLStripTransformer and the
         transformArticle() script function defined above. -->
    <entity name="article"
            dataSource="http-post"
            processor="XPathEntityProcessor"
            forEach="/articleList/article"
            transformer="HTMLStripTransformer,script:transformArticle"
            stream="false"
            onError="continue" >

      <!-- ID fields: transformArticle() prefixes submission_id and journal_id
           with inst_id and checks loadAction for the value "delete". -->
      <field column="submission_id" xpath="/articleList/article/@id" multiValued="false" />
      <field column="section_id" xpath="/articleList/article/@sectionId" multiValued="false" />
      <field column="journal_id" xpath="/articleList/article/@journalId" multiValued="false" />
      <field column="inst_id" xpath="/articleList/article/@instId" multiValued="false" />
      <field column="loadAction" xpath="/articleList/article/@loadAction" multiValued="false" />

      <!-- The etl_*List columns below (and their _locales and _sortOnly
           companions) are intermediate columns: transformArticle() turns
           them into the indexed fields and removes them from the row. -->

      <!-- Authors -->
      <field column="etl_authorList" xpath="/articleList/article/authorList/author" multiValued="true" />

      <!-- Titles -->
      <field column="etl_titleList" xpath="/articleList/article/titleList/title" multiValued="true" />
      <field column="etl_titleList_locales" xpath="/articleList/article/titleList/title/@locale" multiValued="true" />
      <field column="etl_titleList_sortOnly" xpath="/articleList/article/titleList/title/@sortOnly" multiValued="true" />

      <!-- Abstracts -->
      <field column="etl_abstractList" xpath="/articleList/article/abstractList/abstract" multiValued="true" stripHTML="true" />
      <field column="etl_abstractList_locales" xpath="/articleList/article/abstractList/abstract/@locale" multiValued="true" />

      <!-- Disciplines -->
      <field column="etl_disciplineList" xpath="/articleList/article/disciplineList/discipline" multiValued="true" />
      <field column="etl_disciplineList_locales" xpath="/articleList/article/disciplineList/discipline/@locale" multiValued="true" />

      <!-- Subjects -->
      <field column="etl_subjectList" xpath="/articleList/article/subjectList/subject" multiValued="true" />
      <field column="etl_subjectList_locales" xpath="/articleList/article/subjectList/subject/@locale" multiValued="true" />

      <!-- Types -->
      <field column="etl_typeList" xpath="/articleList/article/typeList/type" multiValued="true" />
      <field column="etl_typeList_locales" xpath="/articleList/article/typeList/type/@locale" multiValued="true" />

      <!-- Coverage -->
      <field column="etl_coverageList" xpath="/articleList/article/coverageList/coverage" multiValued="true" />
      <field column="etl_coverageList_locales" xpath="/articleList/article/coverageList/coverage/@locale" multiValued="true" />

      <!-- Journal Titles -->
      <field column="etl_journalTitleList" xpath="/articleList/article/journalTitleList/journalTitle" multiValued="true" />
      <field column="etl_journalTitleList_locales" xpath="/articleList/article/journalTitleList/journalTitle/@locale" multiValued="true" />
      <field column="etl_journalTitleList_sortOnly" xpath="/articleList/article/journalTitleList/journalTitle/@sortOnly" multiValued="true" />

      <!-- Article Publication Date (transformArticle() copies a non-null
           value to publicationDate_dtsort) -->
      <field column="publicationDate_dt" xpath="/articleList/article/publicationDate" multiValued="false" />

      <!-- Issue Publication Date -->
      <field column="issuePublicationDate_dtsort" xpath="/articleList/article/issuePublicationDate" multiValued="false" />

      <!-- Galleys: the etl_galley_xml column is parsed by the nested "galley"
           entity through the "field" data source. transformArticle() stores
           an empty galleyList document in this column when it is missing. -->
      <field column="etl_galley_xml" xpath="/articleList/article/galley-xml" multiValued="false" />
      <entity name="galley"
              dataSource="field"
              dataField="article.etl_galley_xml"
              processor="XPathEntityProcessor"
              forEach="/galleyList/galley"
              stream="false"
              onError="continue" >

        <!-- File Data -->
        <field column="etl_file_name" xpath="/galleyList/galley/@fileName" multiValued="false" />
        <field column="etl_file_locale" xpath="/galleyList/galley/@locale" multiValued="false" />

        <!-- Full Text: the file addressed by the galley's fileName attribute
             is read through the "file" data source and processed by Tika.
             transformFullText() moves the resulting text column into
             document-level fields and removes it from the row. -->
        <entity name="galley_file"
                dataSource="file"
                processor="TikaEntityProcessor"
                url="${galley.etl_file_name}"
                transformer="script:transformFullText"
                format="text"
                onError="continue" >
          <field column="Content-Type" meta="true" />
          <field column="text" />
        </entity>
      </entity>

      <!-- Post-processing: this entity declares no fields of its own. It runs
           postProcessArticle(), which puts the document-level fields
           collected in the session onto the row. -->
      <entity name="articlePostProcessing"
              dataSource="field"
              dataField="article.submission_id"
              processor="PlainTextEntityProcessor"
              transformer="script:postProcessArticle"
              onError="continue" >
      </entity>
    </entity>
  </document>
</dataConfig>
