#!/usr/bin/php4 -q
<?php
/* ******************************************************************** */
/* CATALYST PHP Source Code                                             */
/* -------------------------------------------------------------------- */
/* This program is free software; you can redistribute it and/or modify */
/* it under the terms of the GNU General Public License as published by */
/* the Free Software Foundation; either version 2 of the License, or    */
/* (at your option) any later version.                                  */
/*                                                                      */
/* This program is distributed in the hope that it will be useful,      */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of       */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        */
/* GNU General Public License for more details.                         */
/*                                                                      */
/* You should have received a copy of the GNU General Public License    */
/* along with this program; if not, write to:                           */
/*   The Free Software Foundation, Inc., 59 Temple Place, Suite 330,    */
/*   Boston, MA  02111-1307  USA                                        */
/* -------------------------------------------------------------------- */
/*                                                                      */
/* Filename:    lu-index-tree.php                                       */
/* Author:      Paul Waite                                              */
/* Description: Wrapper for indexing all HTML stories in a directory    */
/*              tree on disk, to Lucene. We expect the path of the      */
/*              directory to be passed on the command line. All HTML    */
/*              files on this path will be recursively indexed.         */
/*                                                                      */
/*              NB: This is intended to be used in a manual mode of     */
/*              intervention, to index a whole archive of stories.      */
/*                                                                      */
/*       usage: ./lu-index-tree.php [-v] <path> [restart]               */
/*                                                                      */
/*              Optional parm "-v" instigates verbose mode.             */
/*              Optional parm "restart" is a literal. If "restart" is   */
/*              present, then it is assumed that <path> is the path     */
/*              to a file containing the paths to index. Ie. a file     */
/*              which might have been generated by a 'find' command.    */
/*                                                                      */
/* ******************************************************************** */
// CLI Program working dir is always its own directory
// so we need to get into the website root directory..
// This has to happen before the include_once() statements which
// follow, since "application.php" is included by a relative name.
// NOTE(review): assumes this script lives exactly one level below
// the website root - confirm against the deployed layout.
chdir("..");

include_once("application.php");
include_once("$LIBDIR/lucene-defs.php");


// Command line handling and the indexing run itself.
//   usage: ./lu-index-tree.php [-v] <path> [restart]
// Only do anything if there are args..
if ($argc > 1) {
  // Take the parameters without the script name. Once the optional
  // "-v" has been removed, the path is always the first element and
  // the optional "restart" literal is always the second, regardless
  // of how many parameters were supplied in total.
  $args = array_slice($argv, 1);

  // Check for verbose mode of operation..
  if ($args[0] === "-v") {
    debug_on(DBG_DEBUG);
    debug_output(DBG_O_CLI);
    array_shift($args);
  }

  // The path is mandatory, but "-v" may have been the only parameter
  // given, so guard the lookup rather than read a missing element.
  $storyarchive = isset($args[0]) ? $args[0] : "";

  // Anything following the path is taken as the restart parameter.
  // Only the literal "restart" changes the behaviour below.
  $restart = isset($args[1]) ? $args[1] : "";

  if ($storyarchive === "") {
    // No path was supplied. Report how to call the script instead of
    // complaining about an archive with an empty name.
    debugbr("usage: ./lu-index-tree.php [-v] <path> [restart]", DBG_DEBUG);
  }
  elseif (file_exists($storyarchive)) {
    // A restarted run skips the pre-optimize step and goes straight
    // on to indexing..
    if ($restart !== "restart") {
      debug("pre-optimizing index..", DBG_DEBUG);
      $optmsg = new lucene_controlmsg("OPTIMIZE");
      // NOTE(review): SOCK_FOREVER presumably means wait without a
      // timeout for the reply - confirm in lucene-defs.php.
      $optmsg->send(SOCK_FOREVER);
      debugbr("done.", DBG_DEBUG);
    }
    $indexer = new lucene_fileindexer();
    $indexer->id_generate(ID_FROM_NAME);
    $indexer->scantags();

    // Note: for Newsquest the following fields should all be
    // defined in the application properties file, so we don't
    // need to use define_field() here. Instead we just make sure
    // they get processed as meta tags from the files..
    $indexer->meta_field("Id", "Id");
    $indexer->meta_field("title", "Text");
    $indexer->meta_field("slug", "Text");
    $indexer->meta_field("date", "Date");
    $indexer->meta_field("added", "Date");
    $indexer->meta_field("category", "Text");
    $indexer->meta_field("subcategory", "Text");
    $indexer->meta_field("type", "Text");
    $indexer->meta_field("source", "Text");
    $indexer->meta_field("sourcetype", "Text");

    // Index it all. The restart parameter is handed on as given, so
    // that the indexer can tell whether <path> is a directory tree or
    // a file containing a list of paths..
    $indexer->index_tree($storyarchive, "*.html", $restart);

    debugbr("indexing completed.", DBG_DEBUG);

    // Optimize the index after the batch has completed..
    debug("optimizing index..", DBG_DEBUG);
    lucene_optimize();
    debugbr("done.", DBG_DEBUG);
  }
  else {
    debugbr("story archive not found '$storyarchive'", DBG_DEBUG);
  }
}
?>
