#!/usr/local/bin/php
<?php
// UniSearch data importer: fetches the Unicode data files, parses them and
// (re)loads the "codepoint" and "block" tables.
//
// The full "<?php" open tag is used on purpose: the short "<?" form is only
// recognised when short_open_tag is enabled, and with it disabled the whole
// script would be echoed back as plain text instead of being executed.

$uniserver='www.unicode.org'; // the unicode server's hostname
$allfile1="allchars1.php"; // the "All Characters by Codepoint" file
$allfile2="allchars2.php"; // the "All Characters by Block" file
$prefix='data'; // the dir, relative to this script's "pwd",
    // to read/write the data files from/to (if you start it with a slash,
    // it won't be relative anymore, it'll be absolute, but it'll work.)

//////// End of "settings"....

// Helper functions used throughout this script (GMT, hexpad, SQuote,
// progressbar, dosql) are not defined in this file; presumably they come
// from functions.phps.
require_once("functions.phps");
$START=GMT(); // start time; subtracted from a later GMT() call to report elapsed seconds

// Create the data directory if needed. The recursive flag lets $prefix be a
// nested path, and a failure stops the run here rather than surfacing later
// as a string of confusing download errors.
if (! is_dir($prefix) && ! mkdir($prefix, 0755, true)) {
    echo "Sorry, I couldn't create the data directory \"$prefix\".\n";
    die;
}

// Local path and source URL of every data file the parser needs.

// ShiftJIS <-> Unicode mapping table. The unicode.org copy below is obsolete:
//$codelistfile="$prefix/SHIFTJIS.TXT";
//$codelisturl="http://$uniserver/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT";
// Found a better one, and it even says "distribute as you wish":
$codelistfile="$prefix/sjis-0213-2004-std.txt";
$codelisturl="http://x0213.org/codetable/sjis-0213-2004-std.txt";

// Character names.
$namesfile="$prefix/NamesList.txt";
$namesurl="http://$uniserver/Public/UNIDATA/NamesList.txt";

// Definitions and pronunciations. These are published as the "UAX 38 -
// Unicode Han Database", which is only available as a ZIP archive; the one
// member we need (Unihan_Readings.txt) is extracted from it further below.
$defsfileZ="$prefix/Unihan.zip";
$defsfile="$prefix/Unihan_Readings.txt";
$defsurlZ="http://$uniserver/Public/UNIDATA/Unihan.zip";

// Block (character group) ranges.
$blocksfile="$prefix/Blocks.txt";
$blocksurl="http://$uniserver/Public/UNIDATA/Blocks.txt";

// Per-character properties; used for the combining class.
$combiningfile="$prefix/UnicodeData.txt";
$combiningurl="http://$uniserver/Public/UNIDATA/UnicodeData.txt";

// Any (non-empty, non-"0") command line argument just lists the source URLs
// and exits. empty() is used instead of a bare $argv[1] so that a normal run
// without arguments does not read an undefined array element.
if (!empty($argv[1])) {
    echo "Data files:\n$codelisturl\n$namesurl\n$defsurlZ\n$blocksurl\n$combiningurl\n";
    die;
}

// Just make this blank or set it to WhatEverYouWant. It's for wget; sets the "user agent."
// Be sure to leave a space at the end if it's not totally-blank!
$params='-U "The UniSearcher (http://www.isthisthingon.org/unicode/)" ';

// Local file => source URL, in download order (the big one first).
$downloads=array(
  $defsfileZ     => $defsurlZ,
  $codelistfile  => $codelisturl,
  $namesfile     => $namesurl,
  $blocksfile    => $blocksurl,
  $combiningfile => $combiningurl,
);

// Fetch whatever is not on disk yet, collecting one message per failure.
$msg="";
foreach ($downloads as $localfile => $url) {
  if (file_exists($localfile)) continue;
  $crap=array();
  // Both the target path and the URL are shell-quoted so that spaces or
  // shell metacharacters in $prefix or a URL cannot break the command line.
  exec("wget -O " . escapeshellarg($localfile) . " $params" . escapeshellarg($url), $crap, $ret);
  if ($ret) {
    $msg .= "  I couldn't download \"$url\"\n";
    // "wget -O" creates the output file before the transfer starts, so a
    // failed download leaves an empty or partial file behind. Remove it;
    // otherwise the next run would see the file, skip the download and try
    // to parse garbage.
    if (file_exists($localfile)) unlink($localfile);
  }
}

if ($msg!="") {
  echo "Sorry, the parser cannot run for the following reason(s):\n$msg" .
    "Download the missing file(s) into the \"$prefix/\" directory and then try again.\n" .
    "Cut and paste as needed:\ncd $prefix\n" .
    "wget '$codelisturl'\nwget '$namesurl'\nwget '$defsurlZ'\nwget '$blocksurl'\n" .
    "wget '$combiningurl'\n";
  die;
}

// Accumulators filled by the parsing phases below:
//   $allcodes     codepoint (hex) => ShiftJIS code (hex)
//   $alldescs     codepoint (hex) => description
//   $alldefs      codepoint (hex) => English definition
//   $allprons     codepoint (hex) => array(pronunciation type => text)
//   $allblocks    list of array(first codepoint, last codepoint, block name)
//   $allcombiners codepoint (decimal) => combining class
$allcodes=$alldescs=$alldefs=$allprons=$allblocks=$allcombiners=array();
$proncnt=$cpcnt=0;

echo "All required data files present and accounted for.\n";

// Extract Unihan_Readings.txt from the downloaded archive, unless it has
// already been extracted by an earlier run.
// NOTE(review): the procedural zip_* API used here is deprecated as of
// PHP 8.0 in favour of ZipArchive; it is kept to stay compatible with the
// PHP version this script was written for.
if (file_exists($defsfileZ) && !file_exists($defsfile)) {
    echo "Unzipping Unihan.zip... ";
    $zo=zip_open($defsfileZ);
    // zip_open() reports failure either as FALSE or as an integer error
    // code, so test for a resource rather than comparing against FALSE.
    if (!is_resource($zo)) {
        echo "failed!\nCouldn't open \"$defsfileZ\". Try something! Fix it!!\n";
        die;
    }
    // zip_read() likewise yields FALSE at the end of the archive or an
    // integer error code; either one ends the scan.
    while (is_resource($zfile=zip_read($zo))) {
        if (zip_entry_name($zfile) != 'Unihan_Readings.txt') continue;
        if (zip_entry_open($zo, $zfile, "r")) {
            $data=zip_entry_read($zfile, zip_entry_filesize($zfile));
            zip_entry_close($zfile);
            // Only write real content; an unreadable entry falls through to
            // the "not found after unzipping" check below.
            if (is_string($data) && $data!=='') file_put_contents($defsfile, $data);
        }
        break; // the one member we need has been handled
    }
    zip_close($zo);
    if (!file_exists($defsfile)) {
        echo "failed!\n\"$defsfile\" not found after unzipping!\n";
        die;
    }
}

echo "\nDone unzipping.\n" .
  "Parsing data; please wait... should only take a couple of minutes at most.\n" .
  "\n$namesfile\n\tReading... ";
flush();

// Phase 1: character names from NamesList.txt -> $alldescs.
$tmp=file($namesfile);
$tmpcnt=count($tmp);

echo "done.\n\tParsing... "; flush();

// Placeholder "names" (compared in lower case) and the text stored instead.
$placeholders=array(
  "<cjk>"      => "Kanji Character",
  "<control>"  => "Control Character",
  "<reserved>" => "Reserved Character",
);

$z=0;
foreach ($tmp as $line) {    //  NAMES
  progressbar(++$z, $tmpcnt);
  // Only lines that begin with a hex digit can be "codepoint name" lines.
  if (preg_match('/^[^abcdef\d]+/i', $line)) continue;
  $tmp2=preg_split('/\s+/', $line, 2);
  $hex=hexpad($tmp2[0]); // hexpad() is defined outside this file; presumably zero-pads the hex string
  $l=strlen($hex);
  if ($l!=4 && $l!=5) continue;
  $name=strtolower(trim(isset($tmp2[1]) ? $tmp2[1] : ''));
  // Compare BEFORE ucwords(): ucwords("<not a character>") yields
  // "<not A Character>", so testing the capitalised text against the
  // lower-case literal could never match and non-characters slipped through.
  if ($name=="<not a character>") continue;
  $d=isset($placeholders[$name]) ? $placeholders[$name] : ucwords($name);
  $alldescs[$hex]=SQuote($d);
}

echo "\n\tProcessed " . number_format(filesize($namesfile)) . " bytes in " .
  number_format($tmpcnt) . " lines.\n" . "$codelistfile\n\tReading... ";
flush();

// Phase 2: ShiftJIS <-> Unicode table -> $allcodes, plus a description for
// any codepoint that did not get one from the names file.
$tmp=file($codelistfile);
$tmpcnt=count($tmp);

echo "done.\n\tParsing... "; flush();

$z=0;
foreach ($tmp as $line) {
  progressbar(++$z, $tmpcnt);
  // Data lines start with the ShiftJIS code written as "0x....".
  if (strtolower(substr($line, 0, 2)) != "0x") continue;
  $a=preg_split('/\s+/', $line, 4);
  // The second column must be a single codepoint, "U+" plus up to 5 hex
  // digits. Some are like "U+0254+0300" which is a combining thing; those are
  // skipped, exactly as the former length check did. Any other token (for
  // instance a "#" when the Unicode column is missing) is now skipped too
  // instead of being turned into an empty codepoint.
  if (!isset($a[1]) || !preg_match('/^U\+[0-9A-F]{1,5}$/i', $a[1])) continue;
  $idx=hexpad(strtoupper(substr($a[1], 2))); // codepoint; will be up to U+2A6B2
  $jis=hexpad(strtoupper(substr($a[0], 2))); // sjis code; will be up to 0xFCF4
  $allcodes[$idx]=$jis;
  // A description from the names file always wins.
  if (isset($alldescs[$idx]) && trim($alldescs[$idx])!="") continue;
  // The description is the rest of the line with any trailing "[...]" note
  // removed. It goes into its own variable; the loop used to overwrite $tmp,
  // the very array being iterated.
  $rest=isset($a[3]) ? $a[3] : '';
  $desc=htmlspecialchars(ucwords(strtolower(trim(preg_replace('/\[.+/', '', $rest)))));
  if (preg_match('/&lt;not a character&gt;/i', $desc)) continue;
  if (preg_match('/&lt;cjk&gt;/i', $desc)) $desc="Kanji Character";
  if (preg_match('/&lt;control&gt;/i', $desc)) $desc="Control Character";
  $alldescs[$idx]=SQuote($desc);
}

echo "\n\tProcessed " . number_format(filesize($codelistfile)) . " bytes in " .
  number_format($tmpcnt) . " lines.\n" . "$defsfile\n\tReading... "; flush();

// Phase 3: Unihan readings -> $alldefs (definitions) and $allprons
// (pronunciations, grouped by type).
$tmp=file($defsfile);
$tmpcnt=count($tmp);
echo "done.\n\tParsing... "; flush();

// There's lots of useful lines for each codepoint. Most we can ignore, but THESE I want:
// Fields whose value is a whitespace-separated list of readings.
$plainprons=array(
  "kJapaneseKun" => 'Japanese Kun',
  "kJapaneseOn"  => 'Japanese On',
  "kVietnamese"  => 'Vietnamese',
  "kKorean"      => 'Korean',
);
// Fields whose readings carry a tone number; each entry is
// array(label, pattern capturing the syllable and its tone digits).
$toneprons=array(
  "kHanyuPinlu" => array('Pinyin', '/(\w+?)(\d*)\s*\(\d+\)/'),
  "kCantonese"  => array('Cantonese', '/(\w+?)(\d+)\s*/'),
);

$z=0;
foreach ($tmp as $line) {
  progressbar(++$z, $tmpcnt);
  // Comment lines. (This used to test substr($line, 0, 0), which is always
  // the empty string, so the check could never fire.)
  if (substr($line, 0, 1)=="#") continue;
  // Split into codepoint, field name and value. A separate variable is used
  // so the array being iterated ($tmp) is no longer overwritten mid-loop.
  $f=preg_split('/\s+/', $line, 3);
  $l=strlen($f[0]);
  if (substr($f[0], 0, 2)!="U+" || ($l!=6 && $l!=7)) continue;
  $uni=substr($f[0], 2);
  // Every codepoint seen here gets an entry, even when none of its lines
  // carries a pronunciation we keep.
  if (!isset($allprons[$uni])) $allprons[$uni]=array();
  $field=isset($f[1]) ? $f[1] : '';
  $value=isset($f[2]) ? trim($f[2]) : '';

  if ($field=="kDefinition") {
    $alldefs[$uni]=SQuote(htmlspecialchars(ucfirst(strtolower($value))));
    continue;
  }

  if (isset($plainprons[$field])) {
    $label=$plainprons[$field];
    $text=preg_replace('/\s+/', ', ', $value);
  } elseif (isset($toneprons[$field])) {
    list($label, $pattern)=$toneprons[$field];
    $regs2=array();
    preg_match_all($pattern, $value, $regs2, PREG_SET_ORDER);
    $parry=array();
    foreach ($regs2 as $rm) {
      // "syllable-tone" when tone digits were captured, else the bare syllable.
      $parry[]=$rm[2] ? "{$rm[1]}-{$rm[2]}" : $rm[1];
    }
    $text=join(', ', $parry);
  } else {
    // Not a field we keep. (The old "continue" inside the switch behaved
    // like "break" and triggers a warning on PHP 7.3+.)
    continue;
  }

  // Append, so repeated lines of one type accumulate; initialise first so
  // ".=" never reads an undefined element.
  if (!isset($allprons[$uni][$label])) $allprons[$uni][$label]='';
  $allprons[$uni][$label].=SQuote(htmlspecialchars(ucwords(strtolower($text))));
}

// Report on the Unihan phase, then load the block ranges from Blocks.txt.
echo "\n\tProcessed " . number_format(filesize($defsfile)) . " bytes in "
  . number_format($tmpcnt) . " lines.\n"
  . "$blocksfile\n\tReading... ";
flush();

$tmp=file($blocksfile);
$tmpcnt=count($tmp);

echo "done\n\tParsing... ";
flush();

// Phase 4: each "FIRST..LAST; Block Name" line becomes one $allblocks row of
// array(first codepoint as int, last codepoint as int, quoted block name).
$z=0;
foreach ($tmp as $line) {
  $z++;
  progressbar($z, $tmpcnt);

  $m=array();
  if (preg_match('/^([\d\w]+)\.\.([\d\w]+);\s+(.+)$/', $line, $m)) {
    // Only ranges whose two ends are both 4 or 5 characters long are kept.
    $firstok=in_array(strlen($m[1]), array(4, 5));
    $lastok=in_array(strlen($m[2]), array(4, 5));
    if ($firstok && $lastok) {
      $first=hexdec(trim($m[1]));
      $last=hexdec(trim($m[2]));
      $title=SQuote(htmlspecialchars(trim($m[3])));
      $allblocks[]=array($first, $last, $title);
    }
  }
}

echo "\n\tProcessed " . number_format(filesize($blocksfile)) . " bytes in " . number_format($tmpcnt) . " lines.\n" .
  "$combiningfile\n\tReading... "; flush(); // flush like every other phase, so the message shows before the read

// Phase 5: UnicodeData.txt -> $allcombiners, plus a placeholder description
// for every character that still has none.
$tmp=file($combiningfile);
$tmpcnt=count($tmp);
echo "done\n\tParsing... "; flush();
$z=0;
/*
 * Sample line from UnicodeData.txt ($combiningfile):
 * 0335;COMBINING SHORT STROKE OVERLAY;Mn;1;NSM;;;;;N;NON-SPACING SHORT BAR OVERLAY;;;;
 * Field number 4 [3] will be > 0 if that character is a "combining form." Those are what we're looking for.
 * Use field 1 [0] as the hex codepoint for that char.
 * This file contains no comments and no commented-out lines. EVERY line is valid.
 * It's also valuable for including chars with no description, definition, shiftjis, or pronunciation info;
 * those, I discovered, were totally missing from the DB before.
*/
foreach ($tmp as $line) {
  progressbar(++$z, $tmpcnt);
  $tmpsplit=explode(';', $line);
  // Defensive: a blank line (e.g. a stray trailing newline) has no codepoint
  // and must not create a bogus entry.
  if (trim($tmpsplit[0])=='') continue;
  $hex=hexpad(trim($tmpsplit[0]));
  $dec=hexdec($hex);
  // Add $alldescs[$hex] with 'no description' if not set by now.
  if (!isset($alldescs[$hex]) || $alldescs[$hex]=='') $alldescs[$hex]='<I>No Description</I>';
  // Add entry for an "official" combining form. A missing or empty field
  // casts to 0, which means "not combining".
  if (!isset($tmpsplit[3]) || (int)$tmpsplit[3]==0) continue;
  $allcombiners[$dec]=(int)$tmpsplit[3];
}


// Final processing; parsing's all done now.

echo "\n\tProcessed " . number_format(filesize($combiningfile)) . " bytes in " . number_format($tmpcnt) . " lines.\n" .
  "\nPreparing \"codepoint\" SQL insert:\n\t"; flush();

// $proncnt is the number of individual pronunciation entries (reported in
// the final statistics).
foreach ($allprons as $P) {
  $proncnt += count($P);
}

// Progress total: one tick per element of each array walked below. The
// pronunciation loop ticks once per CODEPOINT, so count($allprons) is the
// right term here; adding the per-codepoint entry counts (as before) made
// the total disagree with the number of ticks actually produced.
$cnt=count($allcodes) + count($alldescs) + count($allprons) + count($alldefs);

// Merge everything into one row per codepoint, keyed by the upper-case,
// hexpad()-ed hex codepoint:
//   $queries[HEX] = array("sjis"=>..., "descr"=>..., "pron"=>..., "def"=>...)
// Keys that were never set are treated as empty when the rows are written.
$z=0;
$queries=array();
foreach ($allcodes as $k=>$sjis) {
  progressbar(++$z, $cnt);
  $queries[strtoupper(hexpad($k))]["sjis"]=$sjis;
}
foreach ($alldescs as $k=>$descr) {
  progressbar(++$z, $cnt);
  $queries[strtoupper(hexpad($k))]["descr"]=$descr;
}
foreach ($allprons as $k=>$P) {
  progressbar(++$z, $cnt);
  // One "<I>Type:</I> text" line per pronunciation type, separated by
  // "<BR>\n" (no trailing separator; empty string when there are none).
  $pronlines=array();
  foreach ($P as $ptype=>$ptext) {
    $pronlines[]="<I>$ptype:</I> $ptext";
  }
  $queries[strtoupper(hexpad($k))]["pron"]=join("<BR>\n", $pronlines);
}
foreach ($alldefs as $k=>$def) {
  progressbar(++$z, $cnt);
  $queries[strtoupper(hexpad($k))]["def"]=$def;
}
// Add some to the 'combining form' characters, inasmuch as it's possible to FIND THEM...
// Anything whose description looks like a combining mark, but which has no
// combining class from UnicodeData.txt, is flagged with -1.
foreach ($alldescs as $k=>$v) {
  if (!preg_match('/\b(combining|length\s+mark|vowel\s+sign|semivowel\s+sign)\b/i', $v)) continue;
  $kdec=hexdec($k);
  if (!isset($allcombiners[$kdec])) $allcombiners[$kdec]=-1;
}

echo "\nExecuting \"codepoint\" SQL insert:\n\t"; flush();

// Replace the whole "codepoint" table, sending the rows as multi-row INSERT
// statements that each stay within $DbPacketMax bytes.
// $DbPacketMax and dosql() are not defined in this file; presumably they
// come from functions.phps.
// NOTE(review): the statements are built by string concatenation. The text
// columns went through SQuote() during parsing (presumably SQL quoting -
// confirm), but parameterised queries would be safer if dosql() can
// support them.
dosql("delete from codepoint");
$sq="insert into codepoint values ";
$lenchk=strlen($sq); // length of the bare statement prefix, i.e. "no rows yet"
$z=0;
$cnt=count($queries);
// To force it to execute every query individually (ONLY recommended for debugging), uncomment this:
//$DbPacketMax=0;
foreach ($queries as $k=>$row) {
  ++$cpcnt;
  progressbar(++$z, $cnt);
  $kint=hexdec($k);
  // A codepoint may lack any of these parts; missing ones become empty
  // strings (0 for the combining class).
  $sjis=isset($row["sjis"]) ? $row["sjis"] : '';
  $descr=isset($row["descr"]) ? $row["descr"] : '';
  $pron=isset($row["pron"]) ? $row["pron"] : '';
  $def=isset($row["def"]) ? $row["def"] : '';
  $combining=!empty($allcombiners[$kint]) ? $allcombiners[$kint] : '0';
  $tmp="($kint, '$k','$sjis','$descr','$pron','$def', $combining),";
  // Flush the pending batch when this row would push it past the limit -
  // but only if the batch actually holds rows. Without that guard the very
  // first row could trigger a row-less "insert into codepoint values"
  // statement, which always happened in the $DbPacketMax=0 debugging mode.
  if (strlen($sq) > $lenchk && strlen($sq)+strlen($tmp) > $DbPacketMax) {
    $sq=substr($sq, 0, -1); // drop the trailing comma
    dosql($sq);
    $sq="insert into codepoint values ";
  }
  $sq.=$tmp;
}
// Send whatever is left in the final, partially filled batch.
if (strlen($sq) > $lenchk) {
  $sq=substr($sq, 0, -1);
  dosql($sq);
}

echo "\nExecuting \"block\" SQL insert:\n\t"; flush();

// Replace the whole "block" table, batching rows into multi-row INSERT
// statements that each stay within $DbPacketMax bytes.
// $DbPacketMax and dosql() are not defined in this file; presumably they
// come from functions.phps.
dosql("delete from block");
$sq="insert into block values ";
$lenchk=strlen($sq); // length of the bare statement prefix, i.e. "no rows yet"
$z=0;
$cnt=count($allblocks);
foreach($allblocks as $b) {
  progressbar(++$z, $cnt);
  // $b is array(first codepoint, last codepoint, block name).
  $tmp="('{$b[0]}', '{$b[1]}', '{$b[2]}'),";
  // Flush only a batch that already holds rows; otherwise a first row larger
  // than the limit (or $DbPacketMax=0) would send a row-less, invalid
  // "insert into block values" statement.
  if (strlen($sq) > $lenchk && strlen($sq)+strlen($tmp) > $DbPacketMax) {
    $sq=substr($sq, 0, -1); // drop the trailing comma
    dosql($sq);
    $sq="insert into block values ";
  }
  $sq.=$tmp;
}
// Send whatever is left in the final, partially filled batch.
if (strlen($sq) > $lenchk) {
  $sq=substr($sq, 0, -1);
  dosql($sq);
}

// Regenerate the two static "all characters" pages by running the generator
// scripts and capturing their output, then print the run statistics.
// $laststep is the status word that opens the final summary; it describes
// the second generator when that ran, otherwise the step before it.
$laststep="done.";
// isset() first: on a normal run without arguments $argv[1] does not exist.
if (!isset($argv[1]) || trim($argv[1])=='') {
  echo "\n\nRecreating \"$allfile1\"... "; flush();
  // exec() takes the output array second and the exit status third (the two
  // variables used to be passed the wrong way round, and the status was
  // never looked at). The redirect target is shell-quoted.
  $crap=array();
  exec("./allchars1 > " . escapeshellarg($allfile1), $crap, $ret);
  echo ($ret ? "FAILED (exit code $ret)." : "done.") . "\nRecreating \"$allfile2\"... "; flush();
  $crap=array();
  exec("./allchars2 > " . escapeshellarg($allfile2), $crap, $ret);
  if ($ret) $laststep="FAILED (exit code $ret).";
}

$ELAPSED=round(GMT()-$START, 2);

echo "$laststep\n\nAll files imported successfully in $ELAPSED seconds. Statistics:\n\t" .
  number_format($cpcnt) . " Total Characters\n\t" .
  number_format(count($alldescs)) . " Character descriptions\n\t" .
  number_format(count($allcodes)) . " ShiftJIS-to-Unicode mappings\n\t" .
  number_format(count($alldefs)) . " English definitions\n\t" .
  number_format($proncnt) . " Various pronunciations\n\t" .
  number_format(count($allblocks)) . " Character grouping blocks\n\t" .
  number_format(count($allcombiners)) . " Combining Characters\n\n" .
  "UniSearch is now ready for use.\n";
?>