indexer.php
<?
/*
* Simple indexer to stick all the Dr Who subtitles into Xapian
* Oh, how I do like Xapian
*
* Matthew Somerville, http://www.dracos.co.uk/
* Version 1.5, now I have all modern series
*/
# Indexer script: reads the subtitle files matched by the glob below and adds
# one Xapian document per non-empty subtitle line.
#
# Each document carries the line text as its data, boolean terms for
# alignment (A), begin time (B), colour (C), episode (E), position within the
# episode (I), noise flag (N), unique id (Q) and series (S), plus two values:
# slot 0 = begin time in seconds, slot 1 = series/episode sort key.

# tts:color values seen in the subtitle files => colour name stored in the
# index. A colour missing from this table aborts the run (see below).
$colors = array(
    '#fefe00' => 'yellow',
    '#00ffff' => 'cyan',
    '#ededed' => 'white',
    '#00ff00' => 'green',
    '#ffffff' => 'white',
);
include '/usr/share/php/xapian.php';
$db = new XapianWritableDatabase('/home/matthew/xapian2', Xapian::DB_CREATE_OR_OPEN);
$indexer = new XapianTermGenerator();
$stemmer = new XapianStem("english");
$indexer->set_flags(128); # NOTE(review): presumably the term generator's spelling flag - confirm
$indexer->set_database($db); # For spelling
$indexer->set_stemmer($stemmer);
# Read in files
# $files = glob("/home/matthew/DrWho/*/*.sub");
$files = glob("/home/matthew/DrWho/*4/13*.sub");
if ($files === false) {
    # glob() signals an error with false, which foreach cannot iterate
    $files = array();
}
foreach ($files as $file) {
    print "Processing $file...\n";
    if (!preg_match('#^/home/matthew/DrWho/Series(\d)/(\d*?)_(.*?)\.ts\.sub$#', $file, $m)) {
        # Without a match there is no series/episode to index under
        print "Skipping $file: unexpected file name\n";
        continue;
    }
    $series = $m[1];
    $epid = (int) $m[2]; # cast rather than "+ 0", which fails on an empty capture
    $contents = file_get_contents($file);
    if ($contents === false) {
        print "Skipping $file: could not read it\n";
        continue;
    }
    # <p begin = "00:01:13.555" dur="00:00:05.00"><span tts:color="#fefe00" tts:textAlign="center"> I'm happy right now, thanks. </span><br/></p>
    preg_match_all('/<p begin = "([^"]*)" dur="([^"]*)">(.*?)<\/p>/', $contents, $rows, PREG_SET_ORDER);
    $rowcount = 0;
    # Lines of the previous span, used to drop repeated leading lines. Reset
    # for every file so one episode never suppresses lines of the next.
    $lasttextarr = array();
    foreach ($rows as $row) {
        $begin = $row[1];
        # HH:MM:SS.mmm => seconds
        $beginN = substr($begin, 0, 2) * 3600 + substr($begin, 3, 2) * 60 + substr($begin, 6);
        $duration = $row[2];
        if ($duration != '00:00:05.00') {
            # Every row seen so far has this duration; stop if that changes
            print "NEW duration: $duration";
            exit;
        }
        # $text = preg_replace('#((<span[^>]*>).*?[^>])<br/>#', '$1</span><br/>$2', $row[3]);
        preg_match_all('/<span tts:color="([^"]*)" tts:textAlign="([^"]*)">(.*?)<\/span>/', $row[3], $spans, PREG_SET_ORDER);
        foreach ($spans as $span) {
            $color = $span[1];
            if (!isset($colors[$color])) {
                print "New color: $color\n";
                exit;
            }
            $color = $colors[$color];
            $align = $span[2];
            $textarr = explode('<br/>', $span[3]);
            $fulltextarr = $textarr; # untrimmed copy, remembered for the next span
            if (count($textarr) > count($lasttextarr)) {
                # If this span starts with every line of the previous span,
                # index only the lines that were added.
                $same = 1;
                for ($c=0; $c<count($lasttextarr); $c++) {
                    if ($lasttextarr[$c] != $textarr[$c]) $same = 0;
                }
                if ($same) {
                    $textarr = array_slice($textarr, count($lasttextarr));
                }
            }
            foreach ($textarr as $line) {
                $line = trim($line);
                if ($line === '') continue; # strict test so a line of just "0" is kept
                # White text without lower-case letters, "!" or "." is flagged as noise
                $noise = 0;
                if ($line !== 'I' && !preg_match('#[a-z!.]#', $line) && $color=='white') {
                    $noise = 1;
                }
                $id = "$series-$epid-$rowcount";
                $doc = new XapianDocument();
                $indexer->set_document($doc);
                $doc->set_data($line);
                $doc->add_term("A$align");
                $doc->add_term("B$begin");
                $doc->add_term("C$color");
                $doc->add_term("E$epid");
                $doc->add_term("I$rowcount");
                $doc->add_term("N$noise");
                $doc->add_term("Q$id");
                $doc->add_term("S$series");
                $doc->add_value(0, Xapian::sortable_serialise($beginN));
                $doc->add_value(1, sprintf("%d%02d", $series, $epid));
                $indexer->index_text($line);
                $db->add_document($doc);
                $rowcount++;
            }
            $lasttextarr = $fulltextarr;
        }
    }
}
$db = null;
index.php
<?
/*
* Front end for Dr Who subtitles searching, graphing, tagclouding
* Matthew Somerville, http://www.dracos.co.uk/
* Version 1.5. Written at Mashed08 and soon after, at some stupid time, so excuse poor code quality ;)
*/
include_once 'search.php';
# Construct query string from optional advanced search parameters
# Build the search query string from "q" plus the optional advanced search
# parameters. Leaves $query (raw) and $h_query (HTML-escaped) for the page.
# Non-string parameters (e.g. ?q[]=x) are ignored.
$query = (isset($_GET['q']) && is_string($_GET['q'])) ? $_GET['q'] : '';
if (preg_match('#date:(\d+)#', $query, $m)) {
    # A known date:YYYYMMDD term redirects to the matching series/episode query
    list($s, $e) = date_index($m[1]);
    if ($s && $e) {
        $query = preg_replace('#date:\d+#', "series:$s ep:$e", $query);
        header("Location: /~matthew/subtitles/?q=". urlencode($query));
        exit;
    }
}
# Each non-empty filter parameter becomes a "name:value" term
foreach (array('align', 'colour', 'noise', 'ep', 'series') as $filter) {
    if (empty($_GET[$filter]) || !is_string($_GET[$filter])) continue;
    $query .= " $filter:" . ($filter == 'colour' ? strtolower($_GET[$filter]) : $_GET[$filter]);
}
# from/to are minutes; the query uses a seconds range. The advanced form
# always submits both fields, so only add the range when both hold numbers -
# empty fields used to append a bogus "0..0" range.
$from = isset($_GET['from']) ? $_GET['from'] : '';
$to = isset($_GET['to']) ? $_GET['to'] : '';
if (is_numeric($from) && is_numeric($to)) {
    $query .= ' ' . ($from * 60) . '..' . ($to * 60);
}
$query = trim($query);
$h_query = htmlspecialchars($query);
# Header: doctype, <head>, search box and page heading.
# $h_query is written as {$h_query} inside the strings because the bytes of
# the closing curly quote are valid PHP identifier bytes: "$h_query’" would be
# parsed as a different, undefined variable.
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en-gb">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title><? if ($query) echo "‘{$h_query}’ | "; ?>Who Said... Subtitle Search</title>
<link rel="stylesheet" type="text/css" href="style.css">
</head>
<body>
<div id="b">
<form action="./" method="get" id="search">
Search: <input type="text" name="q" value="<?=$h_query?>" size="20">
<input type="submit" value="Search">
</form>
<h1><a href="./">Who Said… <small>– Subtitle Search<?
if ($query) {
    echo " for ‘{$h_query}’";
}
echo '</small></a></h1>';
if (isset($_GET['source'])) {
# Source view: a table of contents, then each script under its own anchored
# heading, syntax-highlighted by highlight_file().
echo '<ul><li><a href="#indexer">indexer.php</a>
<li><a href="#index">index.php</a>
<li><a href="#searchphp">search.php</a>
</ul>';
# anchor name => file, in display order
$sources = array(
    'indexer'   => 'indexer.php',
    'index'     => 'index.php',
    'searchphp' => 'search.php',
);
foreach ($sources as $anchor => $filename) {
    echo '<h2><a name="' . $anchor . '"></a>' . $filename . '</h2>';
    highlight_file($filename);
}
} elseif ($query) {
# Let's do a search
$data = search($query, 1000000);
$estimate = $data['estimate'];
if ($estimate==0) {
    print '<p id="error">No results, please try something else :)</p>';
    front_page();
    footer();
    exit;
}
$data = $data['data'];
# First, collate stats from the data:
#   $next / $prev - how often each word directly follows / precedes the query
#   $eps / $graph - number of counted rows per "series-episode"
#   $max          - largest per-episode count (used to scale the chart)
$graph = array();
$max = 0;
$next = array(); $prev = array(); $eps = array();
# The query is user input: quote it so characters such as "#", "(" or "*"
# are matched literally instead of breaking or altering the pattern.
$q_regex = preg_quote($query, '#');
foreach ($data as $row) {
    $series = $row['series'];
    $ep = $row['ep'];
    # Rows that start after the episode's listed running time are not counted
    $time = time_index($series, $ep, $row['begin']);
    if ($time>1) continue;
    $text = $row['text'];
    if (preg_match('#' . $q_regex . '[\s,.!?]+([\w\']+)#i', $text, $m)) {
        $follow = strtolower($m[1]);
        $next[$follow] = isset($next[$follow]) ? $next[$follow] + 1 : 1;
    }
    if (preg_match('#([\w\']+)[\s,.!?]+' . $q_regex . '#i', $text, $m)) {
        $precede = strtolower($m[1]);
        $prev[$precede] = isset($prev[$precede]) ? $prev[$precede] + 1 : 1;
    }
    # Count explicitly from 1 instead of incrementing an undefined key
    $ep_key = "$series-$ep";
    $eps[$ep_key] = isset($eps[$ep_key]) ? $eps[$ep_key] + 1 : 1;
    $graph[$ep_key] = isset($graph[$ep_key]) ? $graph[$ep_key] + 1 : 1;
    if ($max < $graph[$ep_key]) $max = $graph[$ep_key];
}
# Most frequent first
arsort($next); arsort($prev); arsort($eps);
# Output either a tag cloud if it's an episode, or a line graph
echo '<p align="center">';
if (preg_match('#series:(\d+)#', $query, $mmm) && preg_match('#ep:(\d+)#', $query, $mmmm)) {
    # Single-episode query: show that episode's large image if one exists
    $img = "images/$mmm[1]-$mmmm[1]L.png";
    if (file_exists($img)) {
        $size = getimagesize($img);
        if ($size !== false) {
            echo "<img width='$size[0]' height='$size[1]' alt='' src='$img'>";
        } else {
            # Unreadable image: still reference it, just without dimensions
            echo "<img alt='' src='$img'>";
        }
    }
} else {
    # One data point per episode, scaled from 0 to $max
    echo '<img alt="" src="http://chart.apis.google.com/chart?chs=760x150&chds=0,' . $max . '&cht=ls&chd=t:';
    for ($s=1; $s<=4; $s++) {
        for ($e=1; $e<=14; $e++) {
            if ($s==3 && $e==5) continue; # 3-5 has no data point (the axis labels skip it too)
            if ($s==4 && $e==14) break 2; # series 4 stops at episode 13
            if ($s!=1 || $e>1) print ',';
            # Episodes without any counted row have no $graph entry
            echo (isset($graph["$s-$e"]) ? $graph["$s-$e"] : 0);
        }
    }
    echo '&chxt=y,x,x&chxl=0:|0|'.$max;
    echo '|1:|1|2|3|4|5|6|7|8|9|10|11|12|13|X|1|2|3|4|5|6|7|8|9|10|11|12|13|X|1|2|3|4|6|7|8|9|10|11|12|13|X|1|2|3|4|5|6|7|8|9|10|11|12|13';
    echo '|2:|||||||Series+1||||||||||||||Series+2||||||||||||||Series+3|||||||||||||Series+4||||||';
    #echo '|1:|Runaway+Bride||Partners+in+Crime||Planet+of+the+Ood||Poison+Sky||Unicorn+and+the+Wasp||Forest+of+the+Dead||Turn+Left|';
    #echo '2:||Voyage+of+the+Damned||Fires+of+Pompeii||Sontaran+Stratagem||Doctor\'s+Daughter||Silence+in+the+Library||Midnight|';
    echo '&chxs=1,666666,10,0|2,666666,10,0&chg=0,0&chco=005aaa&chg=1.923,0'; # Last num from trial and error! Was 8.33 for just Series 4
    echo '">';
}
echo '</p>';
# Right hand column of stats/form/links
# $eps, $next and $prev are sorted most-frequent-first, so the first entry of
# each is the one to report. reset()/key()/current() replace each(), which no
# longer exists in PHP 8; an empty counter simply prints nothing.
# {$h_query} is braced because the bytes of the following curly quote would
# otherwise be read as part of the variable name.
echo '<div id="blurb">';
echo '<h2 style="margin-top:0">Stats</h2>';
if (!preg_match('#series:\d+#', $query) || !preg_match('#ep:\d+#', $query)) {
    # Only meaningful when the query is not already limited to one episode
    reset($eps);
    $word = key($eps);
    if ($word !== null) {
        $num = current($eps);
        print "<p>‘{$h_query}’ is mentioned most in the episode <strong>" . episode_lookup($word) . "</strong>, $num time" . ($num!=1?'s':'') . "</p>";
    }
}
reset($next);
$word = key($next);
if ($word !== null) {
    $num = current($next);
    print "<p>The most common word following ‘{$h_query}’ is <strong>$word</strong>, $num time" . ($num!=1?'s':'') . ".</p>";
}
reset($prev);
$word = key($prev);
if ($word !== null) {
    $num = current($prev);
    print "<p>The most common word preceding ‘{$h_query}’ is <strong>$word</strong>, $num time" . ($num!=1?'s':'') . ".</p>";
}
# Advanced search form. The from/to boxes are pre-filled from the request.
# The values sit in single-quoted attributes, so they are escaped with
# ENT_QUOTES (otherwise a "'" in the parameter could end the attribute), and
# absent or non-string parameters fall back to an empty box.
$h_from = (isset($_GET['from']) && is_string($_GET['from'])) ? htmlspecialchars($_GET['from'], ENT_QUOTES) : '';
$h_to = (isset($_GET['to']) && is_string($_GET['to'])) ? htmlspecialchars($_GET['to'], ENT_QUOTES) : '';
?>
<form method="get">
<h3>Advanced Search</h3>
<p>Words: <input type="text" name="q" value="" size="20">
<br>Series: <select name="series"><option>1<option>2<option>3<option>4</select>
<br>Episode: <select name="ep">
<? for ($k=1; $k<=14; $k++) { print "<option>$k"; } ?>
</select>
<br>Alignment: <select name="align"><option value="">- Any -<option value="left">Left<option value="center">Centred<option value="right">Right</select>
<br>Colour: <select name="colour"><option value="">- Any -<option>White<option>Cyan<option>Yellow<option>Green</select>
<br><input id="form_noise" type="checkbox" name="noise" value="1"> <label for="form_noise">Stage direction?</label>
<br>Between <input type="text" size="2" name="from" value='<?=$h_from?>'>
– <input type="text" size="2" name="to" value='<?=$h_to?>'> minutes in
<br><input type="submit" value="Search">
<p><small>You can also use quoted phrases or use boolean logic.</small></p>
</form>
<h2>Complete episodes</h2>
<div id="eps_side">
<?
# One list per series. Every episode shows its title and, when the file
# exists, its small image; all of them link to their episode listing except
# 3-5, which is printed unlinked.
foreach (array(1, 2, 3, 4) as $season) {
    echo "\n<h3>Series $season</h3> <ul>";
    $last = ($season == 4) ? 13 : 14; # series 4 stops at episode 13
    for ($number = 1; $number <= $last; $number++) {
        $code = "$season-$number";
        $linked = !($season == 3 && $number == 5);
        $thumb = "images/{$code}S.png";
        echo '<li>';
        if ($linked) {
            echo "<a href='./?q=series:$season+ep:$number'>";
        }
        echo episode_lookup($code);
        if (file_exists($thumb)) {
            echo "<br><img alt='' src='$thumb'>";
        }
        if ($linked) {
            echo '</a>';
        }
    }
    echo '</ul>';
}
?>
</div>
</div> <!-- Blurb -->
<h2>Results</h2>
<ul id='searchresults'>
<?
# One <li> per matching subtitle line, styled by colour, alignment and noise
# flag, linking to that line's anchor in the full episode listing.
# The query is user input, so it is quoted before being used as the
# highlight pattern; otherwise "#", "(" etc. would break the regex.
# NOTE(review): the subtitle text is echoed without HTML escaping, as before -
# confirm the indexed data can never contain markup.
$highlight = '#' . preg_quote($query, '#') . '#i';
$colour_css = array(
    'yellow' => 'color:#ffff00',
    'cyan'   => 'color:#00ffff',
    'green'  => 'color:#00ff00',
);
foreach ($data as $row) {
    $text = preg_replace($highlight, '<span class="hi">$0</span>', $row['text']);
    $pretty_time = episode_lookup("$row[series]-$row[ep]") . ' ' . prettify_time($row['begin']);
    # Might not need colour on both <li> and <a> - <a> definitely needed for IE, and might be enough for others?
    $link_style = isset($colour_css[$row['colour']]) ? array($colour_css[$row['colour']]) : array();
    $style = $link_style;
    if ($row['align']=='center') $style[] = 'text-align:center';
    elseif ($row['align']=='right') $style[] = 'text-align:right';
    if ($row['noise']=='1') $style[] = 'font-style:italic';
    echo '<li';
    if ($style) echo ' style="' . join(';', $style) . '"';
    echo '><a name="' . $row['pos'] . '" href="./?q=series:' . $row['series'] . '+ep:' . $row['ep'] . '#' . $row['pos'] . '"';
    if ($link_style) echo ' style="' . join(';', $link_style) . '"';
    echo '>' . $text;
    echo '<span class="t">' . $pretty_time . '</span></a>';
    echo '</li>';
}
echo '</ul>';
} else {
front_page();
}
footer();
# ---
# Print the credits paragraph and close the page's <div>, <body> and <html>.
function footer() {
    # No interpolation takes place: the text contains no "$" or backslash.
    # The blank line before the terminator keeps the final newline.
    echo <<<FOOTER_HTML
<p id="footer">
Subtitle data from <a href="http://mashed08.backnetwork.com/event/?articleid=28">BBC Redux</a>,
loaded into a <a href="http://www.xapian.org/">Xapian</a> database,
graphs plotted with <a href="http://code.google.com/apis/chart/">Google Charts API</a>,
<br>and tag clouds drawn by <a href="http://wordle.net/">Wordle</a>.
Everything else (<a href="./?source=1">source</a>) by <a href="http://www.dracos.co.uk/">Matthew Somerville</a>, get in touch! :-)</p>
</div>
</body>
</html>

FOOTER_HTML;
}
# Utility functions
#function prettify_date($d) {
# return date('jS F', strtotime(substr($d,1,4).'-'.substr($d,5,2).'-'.substr($d,7,2)));
#}
# Shorten an "HH:MM:SS.mmm" timestamp for display: "MM:SS" when the hours
# field is 00, otherwise "HH:MM:SS". Fractions of a second are dropped.
function prettify_time($t) {
    $in_first_hour = (substr($t, 0, 2) == '00');
    return $in_first_hour ? substr($t, 3, 5) : substr($t, 0, 8);
}
# Translate the value of a "date:YYYYMMDD" search term into
# array(series, episode). Unknown dates give array(0, 0).
function date_index($d) {
    # date, series, episode
    $known = array(
        array('20080402', 2, 14),
        array('20080403', 3, 14),
        array('20080405', 4, 1),
        array('20080412', 4, 2),
        array('20080419', 4, 3),
        array('20080426', 4, 4),
        array('20080503', 4, 5),
        array('20080510', 4, 6),
        array('20080517', 4, 7),
        array('20080531', 4, 8),
        array('20080607', 4, 9),
        array('20080614', 4, 10),
        array('20080621', 4, 11),
    );
    foreach ($known as $entry) {
        # Loose comparison on purpose, so matching follows the same rules as before
        if ($d == $entry[0]) {
            return array($entry[1], $entry[2]);
        }
    }
    return array(0,0);
}
# Position of a subtitle within its episode as a fraction of the episode's
# running time (lengths in minutes, found by manually looking).
#   $s, $e - series and episode number
#   $t     - begin time as "HH:MM:SS.mmm"
# A result above 1 means the time lies beyond the listed length.
function time_index($s, $e, $t) {
    $seconds = substr($t, 0, 2) * 3600 + substr($t, 3, 2) * 60 + substr($t, 6);
    # 45 minutes unless the episode is listed below
    $minutes = 45;
    if ($s == 2) {
        if ($e == 9) $minutes = 49;
        elseif ($e == 13) $minutes = 46;
        elseif ($e == 14) $minutes = 60;
    } elseif ($s == 3) {
        if ($e == 11 || $e == 12) $minutes = 46;
        elseif ($e == 13) $minutes = 51;
        elseif ($e == 14) $minutes = 75;
    } elseif ($s == 4) {
        if ($e == 1 || $e == 2) $minutes = 48;
        elseif ($e == 7 || $e == 10) $minutes = 44;
        elseif ($e == 11) $minutes = 49;
        elseif ($e == 13) $minutes = 63;
    }
    # Same division order as "$t / 45/60" to keep the floating point result identical
    return $seconds / $minutes / 60;
}
# For most common use display
# Return the episode title for a "series-episode" code such as '4-13', or
# null when the code is not in the table (previously an undefined-index
# read). The table is static so it is built once, not on every call.
function episode_lookup($n) {
    static $eps = array(
        '1-1' => 'Rose',
        '1-2' => 'The End of the World',
        '1-3' => 'The Unquiet Dead',
        '1-4' => 'Aliens of London',
        '1-5' => 'World War Three',
        '1-6' => 'Dalek',
        '1-7' => 'The Long Game',
        '1-8' => 'Father’s Day',
        '1-9' => 'The Empty Child',
        '1-10' => 'The Doctor Dances',
        '1-11' => 'Boom Town',
        '1-12' => 'Bad Wolf',
        '1-13' => 'The Parting of the Ways',
        '1-14' => 'Christmas Invasion',
        '2-1' => 'New Earth',
        '2-2' => 'Tooth and Claw',
        '2-3' => 'School Reunion',
        '2-4' => 'The Girl in the Fireplace',
        '2-5' => 'Rise of the Cybermen',
        '2-6' => 'The Age of Steel',
        '2-7' => 'The Idiot’s Lantern',
        '2-8' => 'The Impossible Planet',
        '2-9' => 'The Satan Pit',
        '2-10' => 'Love and Monsters',
        '2-11' => 'Fear Her',
        '2-12' => 'Army of Ghosts',
        '2-13' => 'Doomsday',
        '2-14' => 'Runaway Bride',
        '3-1' => 'Smith and Jones',
        '3-2' => 'The Shakespeare Code',
        '3-3' => 'Gridlock',
        '3-4' => 'Daleks in Manhattan',
        '3-5' => 'Evolution of the Daleks',
        '3-6' => 'The Lazarus Experiment',
        '3-7' => '42',
        '3-8' => 'Human Nature',
        '3-9' => 'The Family of Blood',
        '3-10' => 'Blink',
        '3-11' => 'Utopia',
        '3-12' => 'The Sound of Drums',
        '3-13' => 'Last of the Time Lords',
        '3-14' => 'Voyage of the Damned',
        '4-1' => 'Partners in Crime',
        '4-2' => 'The Fires of Pompeii',
        '4-3' => 'Planet of the Ood',
        '4-4' => 'The Sontaran Stratagem',
        '4-5' => 'The Poison Sky',
        '4-6' => 'The Doctor’s Daughter',
        '4-7' => 'The Unicorn and the Wasp',
        '4-8' => 'Silence in the Library',
        '4-9' => 'Forest of the Dead',
        '4-10' => 'Midnight',
        '4-11' => 'Turn Left',
        '4-12' => 'The Stolen Earth',
        '4-13' => "Journey's End",
    );
    return isset($eps[$n]) ? $eps[$n] : null;
}
# Print the landing page: example searches, the project blurb, a linked list
# of every episode (with its small image when the file exists) and the notes
# on which parts of each series are in the database.
function front_page() { ?>
<div id="examples">
Some example searches:
<a href="./?q=rose">Rose</a> /
<a href="./?q=martha">Martha</a> /
<a href="./?q=donna">Donna</a>,
<a href="./?q=tardis">TARDIS</a>,
<a href="./?q=tyler+-rose">Tyler -Rose</a>,
<a href="./?q=sonic">sonic</a>,
<a href="./?q='very clever'">"very clever"</a>,
<a href="./?noise=1"><i>all stage directions</i></a>,
<a href="./?q=doctor+colour:cyan"><i>all cyan subtitles mentioning doctor</i></a>.
</div>
<p>I created this small project as part of BBC Mashed 2008. The database contains all the subtitles the BBC’s lovely program could
parse from the modern Doctor Who, so results will obviously be wrong for the <a href="#missing">missing
areas</a>.</p>
<p>You can search by word, phrase, stage direction-ness, subtitle colour or
position, series, episode, or time within episode. Each episode has a representative tag cloud, and search results have
a line graph showing usage throughout the series. All subtitles on search
results are clickable to go to that point in the full episode list of subtitles.
</p>
<ul id="front_eps">
<?
    for ($season = 1; $season <= 4; $season++) {
        $last = ($season == 4) ? 13 : 14; # series 4 stops at episode 13
        for ($number = 1; $number <= $last; $number++) {
            # 3-5 is left out of this list entirely, so every item printed is a link
            if ($season == 3 && $number == 5) {
                continue;
            }
            $thumb = "images/$season-{$number}S.png";
            echo '<li>';
            echo '<a href="./?q=series:', $season, '+ep:', $number, '">';
            if (file_exists($thumb)) {
                echo '<img alt="" src="', $thumb, '"><br>';
            }
            echo episode_lookup("$season-$number");
            echo '</a>';
        }
    }
?>
</ul>
<p style="clear:both"><a name="missing"></a>Here’s the precise details of what I have:</p>
<ul>
<li>Series 1: All of episodes 1, 4, 5, 7, 9, and 13; almost all of episode 10; around 30 minutes of episodes 3 and 12; around 20 minutes of episodes 2 and 6; 13 minutes of episodes 8 and 11.
<li>20 minutes of Christmas Invasion
<li>Series 2: All of episodes 3, 4, 9, 10, 13; around 35 minutes of episodes 6, 7, and 8; around 20–25 minutes of episodes 5 and 11; and around 15 minutes for episodes 1, 2, and 12.
<li>All of The Runaway Bride
<li>Series 3: All of episodes 1, 3, 6, 7, 9, 10, 11, 12, 13; around half an hour of episodes 2 and 8; 13 minutes of episode 4; none of episode 5.
<li>13.5 minutes of Voyage of the Damned
<li>Series 4: All of episodes 1, 2, 4, 7, 10, 11, 12, and 13; 37.5 minutes of episode 9; only 10–13 minutes of episodes 3, 5, 6, and 8.
</ul>
<?
}
search.php
<?
/*
* Simple query parser to search our Dr Who subtitles
*
* Matthew Somerville, http://www.dracos.co.uk/
* Version 1.5
*/
include '/usr/share/php/xapian.php';
# Run a query against the subtitle database.
#   $query - query string; supports the align:, colour:, ep:, noise: and
#            series: filters and a numeric range on value slot 0
#   $num   - maximum number of matches to fetch
# Returns array('query' => description of the parsed query,
#               'estimate' => estimated number of matches,
#               'data' => one array per match).
# Each match has 'text' and 'terms' (always an array, possibly empty), plus
# 'align', 'begin', 'colour', 'ep', 'noise', 'series' and 'pos' for whichever
# of those prefixed terms the document carries.
function search($query, $num = 20) {
    $db = new XapianDatabase('/home/matthew/xapian2');
    $enquire = new XapianEnquire($db);
    $stemmer = new XapianStem("english");
    $qp = new XapianQueryParser();
    $valuerange = new XapianNumberValueRangeProcessor(0);
    $qp->set_stemmer($stemmer);
    $qp->set_database($db);
    $qp->set_stemming_strategy(XapianQueryParser::STEM_SOME);
    $qp->set_default_op(Query_OP_AND);
    $qp->add_boolean_prefix('align', 'A');
    $qp->add_boolean_prefix('colour', 'C');
    $qp->add_boolean_prefix('ep', 'E');
    $qp->add_boolean_prefix('noise', 'N');
    $qp->add_boolean_prefix('series', 'S');
    $qp->add_valuerangeprocessor($valuerange);
    $query = $qp->parse_query($query, XapianQueryParser::FLAG_BOOLEAN | XapianQueryParser::FLAG_PHRASE |
        XapianQueryParser::FLAG_LOVEHATE | XapianQueryParser::FLAG_WILDCARD |
        XapianQueryParser::FLAG_SPELLING_CORRECTION);
    $enquire->set_query($query);
    $enquire->set_sort_by_value(1, true);
    $matches = $enquire->get_mset(0, $num);
    $desc = $query->get_description();
    $estimate = $matches->get_matches_estimated();

    # Single-letter term prefix => result field holding the rest of the term
    $fields = array(
        'A' => 'align',
        'B' => 'begin',
        'C' => 'colour',
        'E' => 'ep',
        'N' => 'noise',
        'S' => 'series',
    );
    $out = array();
    $iter = $matches->begin();
    while (!$iter->equals($matches->end())) {
        $doc = $iter->get_document();
        # 'terms' starts as an empty array so callers can always read it
        $data = array('text' => $doc->get_data(), 'terms' => array());
        $termiter = $doc->termlist_begin();
        while (!$termiter->equals($doc->termlist_end())) {
            $term = $termiter->get_term();
            $prefix = substr($term, 0, 1);
            if (isset($fields[$prefix])) {
                $data[$fields[$prefix]] = substr($term, 1);
            } elseif ($prefix == 'I') {
                # 'pos' keeps the whole term, prefix included
                $data['pos'] = $term;
            } else {
                $data['terms'][] = $term;
            }
            $termiter->next();
        }
        $out[] = $data;
        $iter->next();
    }
    $db = null;
    return array(
        'query' => $desc,
        'estimate' => $estimate,
        'data' => $out,
    );
}