",a(b).prepend(k);var l={settingsdialog:a("#"+c+" .poodll_dialogue_box_settings"),downloaddialog:a("#"+c+" .poodll_dialogue_box_download"),errorsdialog:a("#"+c+" .poodll_dialogue_box_errors"),settingsicon:a("#"+c+" .settingsicon"),status:a("#"+c+" .poodll_status_push"),preview:a("#"+c+" .poodll_preview_push"),bigbutton:a("#"+c+".poodll_mediarecorderbox_push"),playcanvas:a("#"+c+"_playcanvas"),thecaption:a("#"+c+"_caption"),themicicon:a("#"+c+"_micicon"),stopbutton:a("#"+c+" .poodll_mediarecorder_bogusstopbutton_push"),startbutton:a("#"+c+" .poodll_mediarecorder_bogusstartbutton_push")};return g.downloaddialog.set_dialogue_box(l.downloaddialog),g.errordialog.set_dialogue_box(l.errorsdialog),this.devsettings.set_dialogue_box(l.settingsdialog),l},register_controlbar_events_video:function(a,b){return this.register_controlbar_events_audio(a,b)},register_controlbar_events_audio:function(b,d){var e=this,f=this.pmr,h=this.fetch_instanceprops();h.config.recanim="hwave_mic";var i=g.clone();e.therecanim=i,i.init(h.audioanalyser,h.controlbar.playcanvas.get(0)),this.set_visual_mode("startbuttonready"),h.controlbar.bigbutton.click(function(g){c.debug(g.target);var i=!1;if((g.target===e||a(g.target).hasClass("style-holder")||a(g.target).hasClass("poodll_mediarecorderbox_push")||a(g.target).hasClass("poodll_mediarecorder_caption_push")||a(g.target).hasClass("poodll_mediarecorder_playcanvas_push"))&&(i=!0),i)switch(e.buttonmode){case"startbuttonready":var j={};j.type="recorderstatus",j.status="startbuttonrecording",h.config.hermes.postMessage(j),h.timer.disable(),f.do_start_audio(h,b),e.set_visual_mode("startbuttonrecording");break;case"startbuttonrecording":e.just_stop(d);break;case"oldstartbuttonready":var k=function(){var a=(new Date).getTime(),c=a-l;if(c>m)h.timer.enable(),f.do_start_audio(h,b);else{var d=!1;if(n<0&&c>0?d=m/1e3:n<1e3&&c>1e3?d=m/1e3-1:n<2e3&&c>2e3&&(d=m/1e3-2),d){var 
e={};e.type="countdownstatus",e.status=d,h.config.hermes.postMessage(e)}n=c,setTimeout(k,100)}};h.config.hermes.enable(),e.set_visual_mode("startbuttoncountdown");var l=(new Date).getTime(),m=3e3,n=-1;setTimeout(k,100);break;case"stopbutton":e.stop_and_upload()}}),h.controlbar.settingsicon.click(function(a){c.debug("we no proapagato"),a.stopPropagation(),e.uploaded?h.downloaddialog.open():e.devsettings.open()}),window.onbeforeunload=function(){}},just_stop:function(){var a=this.pmr,b=this.fetch_instanceprops(),c=this.therecanim;b.mediaRecorder&&a.do_stop_audio(b),c.clear(),b.config.hermes.enable(),this.set_visual_mode("startbuttonready")},stop_and_upload:function(a){var b=this,c=this.pmr,d=this.fetch_instanceprops(),e=b.therecanim;c.do_stop_audio(d),e.clear(),d.timer.stop(),b.update_status(a);var f=function(){d.blobs&&d.blobs.length>0?(c.do_save_audio(d),d.uploaded=!0):setTimeout(f,200)};setTimeout(f,200),b.set_visual_mode("uploading")},enable_button:function(b){a(b).attr("disabled",!1),a(b).removeClass("pmr_disabled")},disable_button:function(b){a(b).attr("disabled",!0),a(b).addClass("pmr_disabled")}}});
\ No newline at end of file
diff --git a/amd/build/speech_awstranscribe.min.js b/amd/build/speech_awstranscribe.min.js
deleted file mode 100644
index 36f70041..00000000
--- a/amd/build/speech_awstranscribe.min.js
+++ /dev/null
@@ -1 +0,0 @@
-define(["jquery","core/log"],function(a,b){"use strict";return b.debug("speech_awstranscribe: initialising"),{recognition:null,recognizing:!1,ignore_onend:!1,final_transcript:"",start_timestamp:0,lang:"en-US",clone:function(){return a.extend(!0,{},this)},init:function(a){var b=b||webkitSpeechRecognition;this.recognition=new b,this.recognition.continuous=!0,this.recognition.interimResults=!0,this.lang=a?a:"en-US",this.register_events()},set_grammar:function(a){var b=b||webkitSpeechGrammarList;if(b){var c=new b;c.addFromString(a,1),this.recognition.grammars=c}},start:function(){this.recognizing||(this.recognizing=!0,this.final_transcript="",this.recognition.lang=this.lang,this.recognition.start(),this.ignore_onend=!1,this.start_timestamp=Date.now())},stop:function(){this.recognizing=!1,this.recognition.stop()},register_events:function(){var a=this.recognition,c=this;a.onstart=function(){c.recognizing=!0},a.onerror=function(a){"no-speech"==a.error&&(b.debug("info_no_speech"),c.ignore_onend=!0),"audio-capture"==a.error&&(b.debug("info_no_microphone"),c.ignore_onend=!0),"not-allowed"==a.error&&(a.timeStamp-c.start_timestamp<100?b.debug("info_blocked"):b.debug("info_denied"),c.ignore_onend=!0)},a.onend=function(){0!=c.recognizing&&(c.ignore_onend?c.recognizing=!1:a.start())},a.onresult=function(a){for(var b="",d=a.resultIndex;dStreaming Exception "+a.reason),toggleStartStop())}},handleEventStreamMessage:function(b){var c=b.Transcript.Results;if(c.length>0&&c[0].Alternatives.length>0){var d=c[0].Alternatives[0].Transcript;d=decodeURIComponent(escape(d)),a("#transcript").val(transcription+d+"\n"),c[0].IsPartial||(a("#transcript").scrollTop(a("#transcript")[0].scrollHeight),transcription+=d+"\n")}},convertAudioToBinaryMessage:function(a){var b=mic.toRaw(a);if(null!=b){var c=audioUtils.downsampleBuffer(b,sampleRate),d=audioUtils.pcmEncode(c),e=getAudioEventMessage(Buffer.from(d)),f=eventStreamMarshaller.marshall(e);return 
f}},getAudioEventMessage:function(a){return{headers:{":message-type":{type:"string",value:"event"},":event-type":{type:"string",value:"AudioEvent"}},body:a}},createPresignedUrl:function(){var b="transcribestreaming."+region+".amazonaws.com:8443";return v4.createPresignedURL("GET",b,"/stream-transcription-websocket","transcribe",crypto.createHash("sha256").update("","utf8").digest("hex"),{key:a("#access_id").val(),secret:a("#secret_key").val(),protocol:"wss",expires:15,region:region,query:"language-code="+languageCode+"&media-encoding=pcm&sample-rate="+sampleRate})},closeSocket:function(){if(socket.OPEN){micStream.stop();var a=getAudioEventMessage(Buffer.from(new Buffer([]))),b=eventStreamMarshaller.marshall(a);socket.send(b)}}}});
\ No newline at end of file
diff --git a/amd/build/speech_awstranscriber.min.js b/amd/build/speech_awstranscriber.min.js
new file mode 100644
index 00000000..deea8ba8
--- /dev/null
+++ b/amd/build/speech_awstranscriber.min.js
@@ -0,0 +1 @@
+define(["jquery","core/log"],function(a,b){"use strict";return b.debug("aws_instant: initialising"),{final_transcript:"",start_timestamp:0,transcriber:null,clone:function(){return a.extend(!0,{},this)},will_work_ok:function(a){var b=!1;switch(a.language){case"en-AU":case"en-GB":case"en-US":case"es-US":case"fr-FR":case"fr-CA":b=!0;break;default:b=!1}if(b)switch(a.region){case"useast1":case"useast2":case"uswest2":case"sydney":case"dublin":case"ottawa":b=!0;break;default:b=!1}return b},init:function(a){var b=this;require(["https://cdn.jsdelivr.net/gh/justinhunt/cloudpoodll@latest/amd/build/awstranscriber.min.js"],function(c){b.transcriber=c,a.expiretime=300,a.token=a.wstoken,a.wsserver=M.cfg.wwwroot,b.transcriber.init(a),b.register_events()})},set_grammar:function(a){},start:function(a){this.transcriber.active||(this.final_transcript="",this.transcriber.start(a,this.transcriber),this.ignore_onend=!1,this.start_timestamp=Date.now())},stop:function(){this.transcriber.active&&this.transcriber.stop(this.transcriber)},register_events:function(){var a=this;this.transcriber.onFinalResult=function(b,c){a.onfinalspeechcapture(b,c)}},onfinalspeechcapture:function(a,c){b.debug(a)},oninterimspeechcapture:function(a){}}});
\ No newline at end of file
diff --git a/amd/build/speech_browser.min.js b/amd/build/speech_browser.min.js
index 0a60d37d..6f1d236e 100644
--- a/amd/build/speech_browser.min.js
+++ b/amd/build/speech_browser.min.js
@@ -1 +1 @@
-define(["jquery","core/log"],function(a,b){"use strict";return b.debug("speech_browser: initialising"),{recognition:null,recognizing:!1,ignore_onend:!1,final_transcript:"",start_timestamp:0,lang:"en-US",clone:function(){return a.extend(!0,{},this)},init:function(a){var b=b||webkitSpeechRecognition;this.recognition=new b,this.recognition.continuous=!0,this.recognition.interimResults=!0,this.lang=a?a:"en-US",this.register_events()},set_grammar:function(a){var b=b||webkitSpeechGrammarList;if(b){var c=new b;c.addFromString(a,1),this.recognition.grammars=c}},start:function(){this.recognizing||(this.recognizing=!0,this.final_transcript="",this.recognition.lang=this.lang,this.recognition.start(),this.ignore_onend=!1,this.start_timestamp=Date.now())},stop:function(){this.recognizing=!1,this.recognition.stop()},register_events:function(){var a=this.recognition,c=this;a.onstart=function(){c.recognizing=!0},a.onerror=function(a){"no-speech"==a.error&&(b.debug("info_no_speech"),c.ignore_onend=!0),"audio-capture"==a.error&&(b.debug("info_no_microphone"),c.ignore_onend=!0),"not-allowed"==a.error&&(a.timeStamp-c.start_timestamp<100?b.debug("info_blocked"):b.debug("info_denied"),c.ignore_onend=!0)},a.onend=function(){0!=c.recognizing&&(c.ignore_onend?c.recognizing=!1:a.start())},a.onresult=function(a){for(var b="",d=a.resultIndex;d';
controls += '';
controls += '';
- controls += '';
- controls += '';
+
+ //Removing the bogus buttons from the HTML is better. The bogus items will send events (do_play_stop). The jQuery object referring to them is enough for bogus to work
+ //controls += '';
+ //controls += '';
+
/*
controls += '';
controls += '';
controls += '';
*/
+
controls += status,
controls += '';
$(element).prepend(controls);
diff --git a/amd/src/speech_awstranscriber.js b/amd/src/speech_awstranscriber.js
new file mode 100644
index 00000000..d0b69b7b
--- /dev/null
+++ b/amd/src/speech_awstranscriber.js
@@ -0,0 +1,127 @@
+/* jshint ignore:start */
+define(['jquery', 'core/log'], function ($, log) {
+
+ "use strict"; // jshint ;_;
+
+ log.debug('aws_instant: initialising');
+
+ return {
+
+ final_transcript: '',
+ start_timestamp: 0,
+ transcriber: null,
+
+
+ //for making multiple instances
+ clone: function () {
+ return $.extend(true, {}, this);
+ },
+
+ will_work_ok: function(opts){
+ var ret = false;
+
+ //The instance languages
+ switch(opts['language']){
+ case 'en-AU':
+ case 'en-GB':
+ case 'en-US':
+ case 'es-US':
+ case 'fr-FR':
+ case 'fr-CA':
+ ret =true;
+ break;
+ default:
+ ret = false;
+ }
+
+ //The supported regions
+ if(ret) {
+ switch (opts['region']) {
+ case "useast1":
+ case "useast2":
+ case "uswest2":
+ case "sydney":
+ case "dublin":
+ case "ottawa":
+ ret =true;
+ break;
+ default:
+ ret = false;
+ }
+ }
+ return ret;
+ },
+
+
+ init: function (opts) {
+ var that = this;
+ //require(['http://localhost/moodle/local/cpapi/cloudpoodll/amd/build/awstranscriber.min.js'],function(transcriber){
+ require(['https://cdn.jsdelivr.net/gh/justinhunt/cloudpoodll@latest/amd/build/awstranscriber.min.js'],function(transcriber){
+ that.transcriber = transcriber;
+ opts['expiretime'] = 300;
+ opts['token'] = opts['wstoken'];
+ opts['wsserver'] = M.cfg.wwwroot;
+ that.transcriber.init(opts);
+ that.register_events();
+ });
+
+ //init streaming transcriber
+ /*
+ var opts = {};
+ opts['language'] = lang;
+ opts['region'] = app.props.region;
+ opts['token'] = app.props.token;
+ opts['parent'] = app.props.parent;
+ opts['owner'] = app.props.owner;
+ opts['appid'] = app.props.appid;
+ opts['expiretime'] = app.props.expiretime;
+ */
+
+
+ },
+
+ set_grammar: function (grammar) {
+ /*
+ var SpeechGrammarList = SpeechGrammarList || webkitSpeechGrammarList;
+ if (SpeechGrammarList) {
+ var speechRecognitionList = new SpeechGrammarList();
+ speechRecognitionList.addFromString(grammar, 1);
+ this.recognition.grammars = speechRecognitionList;
+ }
+ */
+ },
+
+ start: function (stream) {
+ if (this.transcriber.active) {
+ return;
+ }
+ this.final_transcript = '';
+ this.transcriber.start(stream, this.transcriber);
+ this.ignore_onend = false;
+ this.start_timestamp = Date.now();//event.timeStamp;
+
+ },
+ stop: function () {
+ if (!this.transcriber.active) {
+ return;
+ }
+ this.transcriber.stop(this.transcriber);
+ },
+
+ register_events: function () {
+ var that=this;
+ this.transcriber.onFinalResult = function(speechtext, speechresults) {
+ that.onfinalspeechcapture(speechtext,speechresults);
+ };
+
+ },//end of register events
+
+ onfinalspeechcapture: function (speechtext,speechresults) {
+ log.debug(speechtext);
+ },
+ oninterimspeechcapture: function (speechtext) {
+ // log.debug(speechtext);
+ }
+
+ };//end of returned object
+});//total end
diff --git a/amd/src/speech_browser.js b/amd/src/speech_browser.js
index 375071d2..de5ff1dd 100644
--- a/amd/src/speech_browser.js
+++ b/amd/src/speech_browser.js
@@ -20,12 +20,16 @@ define(['jquery', 'core/log'], function ($, log) {
return $.extend(true, {}, this);
},
- init: function (lang) {
+ will_work_ok: function(opts){
+ return 'webkitSpeechRecognition' in window || 'SpeechRecognition' in window;
+ },
+
+ init: function (opts) {
var SpeechRecognition = SpeechRecognition || webkitSpeechRecognition;
this.recognition = new SpeechRecognition();
this.recognition.continuous = true;
this.recognition.interimResults = true;
- this.lang = lang ? lang : 'en-US';
+ this.lang = opts.language ? opts.language : 'en-US';
this.register_events();
},
@@ -39,7 +43,8 @@ define(['jquery', 'core/log'], function ($, log) {
}
},
- start: function () {
+ start: function (stream) {
+ //browser recognition does not actually need the stream
if (this.recognizing) {
return;
}
@@ -106,7 +111,7 @@ define(['jquery', 'core/log'], function ($, log) {
for (var i = event.resultIndex; i < event.results.length; ++i) {
if (event.results[i].isFinal) {
that.final_transcript += event.results[i][0].transcript;
- that.onfinalspeechcapture(that.final_transcript);
+ that.onfinalspeechcapture(that.final_transcript,JSON.stringify(event.results));
that.final_transcript = '';
} else {
interim_transcript += event.results[i][0].transcript;
@@ -118,7 +123,7 @@ define(['jquery', 'core/log'], function ($, log) {
};
},//end of register events
- onfinalspeechcapture: function (speechtext) {
+ onfinalspeechcapture: function (speechtext,speechresults) {
log.debug(speechtext);
},
oninterimspeechcapture: function (speechtext) {
diff --git a/amd/src/speech_poodll.js b/amd/src/speech_poodll.js
index 8ef18a11..a79c5e82 100644
--- a/amd/src/speech_poodll.js
+++ b/amd/src/speech_poodll.js
@@ -1,5 +1,5 @@
/* jshint ignore:start */
-define(['jquery', 'core/log', 'filter_poodll/speech_browser'], function ($, log, browserrecognition) {
+define(['jquery', 'core/log', 'filter_poodll/speech_browser', 'filter_poodll/speech_awstranscriber'], function ($, log, browserrecognition, awstranscriber) {
"use strict"; // jshint ;_;
@@ -15,17 +15,53 @@ define(['jquery', 'core/log', 'filter_poodll/speech_browser'], function ($, log,
return $.extend(true, {}, this);
},
- supports_browser: function () {
- return 'webkitSpeechRecognition' in window || 'SpeechRecognition' in window;
+ //check that streaming transcription is possible
+ will_work_ok: function (opts) {
+ //a specific streaming transcriber might be requested
+ //if not , we can just use what is available
+ if('streamingtranscriber' in opts) {
+ switch (opts['streamingtranscriber']) {
+ case 'aws':
+ return awstranscriber.will_work_ok(opts);
+ case 'browser':
+ return browserrecognition.will_work_ok(opts);
+ }
+ }
+ //if no valid streamingtranscriber was suggested, fall back to the defaults (browser, then AWS)
+ return ( browserrecognition.will_work_ok(opts) ||
+ awstranscriber.will_work_ok(opts));
+
},
- init: function (lang) {
- //in future we would like to have multiple recognizers presenting a single interface
- if ('webkitSpeechRecognition' in window || 'SpeechRecognition' in window) {
- this.recognizer = browserrecognition.clone();
- this.recognizer.init(lang);
- } else {
- log.debug('no usable speech recognizer found');
+ init: function (opts) {
+ //multiple recognizers presenting a single interface
+ //if a transcriber is requested we use that, otherwise we default to browser then AWS
+ if('streamingtranscriber' in opts){
+ switch (opts['streamingtranscriber']){
+ case 'aws':
+ this.recognizer = awstranscriber.clone();
+ this.recognizer.init(opts);
+ break;
+ case 'browser':
+ this.recognizer = browserrecognition.clone();
+ this.recognizer.init(opts);
+ break;
+ }
+ }
+
+ //if no hinted transcriber, just choose
+ if(!this.recognizer) {
+ if (browserrecognition.will_work_ok(opts)) {
+ this.recognizer = browserrecognition.clone();
+ this.recognizer.init(opts);
+ } else if (awstranscriber.will_work_ok(opts)) {
+ this.recognizer = awstranscriber.clone();
+ this.recognizer.init(opts)
+ } else {
+ //should never arrive here. supposed to check first
+ log.debug('no usable speech recognizer found');
+ return false;
+ }
}
},
@@ -33,14 +69,14 @@ define(['jquery', 'core/log', 'filter_poodll/speech_browser'], function ($, log,
this.recognizer.set_grammar(grammar);
},
- start: function () {
+ start: function (stream) {
if (!this.recognizer) {
return;
}
this.recognizer.onfinalspeechcapture = this.onfinalspeechcapture;
this.recognizer.oninterimspeechcapture = this.oninterimspeechcapture;
if (this.recognizer) {
- this.recognizer.start();
+ this.recognizer.start(stream);
}
},
stop: function () {
@@ -52,7 +88,7 @@ define(['jquery', 'core/log', 'filter_poodll/speech_browser'], function ($, log,
}
},
- onfinalspeechcapture: function (speechtext) {
+ onfinalspeechcapture: function (speechtext,speechresults) {
if (!this.recognizer) {
return;
}
diff --git a/classes/constants.php b/classes/constants.php
index 440e19d9..aaead0f6 100644
--- a/classes/constants.php
+++ b/classes/constants.php
@@ -10,6 +10,7 @@
class constants {
const MOD_FRANKY = 'filter_poodll';
+ const M_COMP = 'filter_poodll';
const AWS_V2 = '2.x';
const AWS_V3 = '3.x';
const AWS_NONE = 'none';
diff --git a/classes/diff.php b/classes/diff.php
new file mode 100644
index 00000000..fa819b42
--- /dev/null
+++ b/classes/diff.php
@@ -0,0 +1,583 @@
+is]] , [[dog,doggies,dogs],[dogs=>is]]
+ * When processing, the first item in the word array is matched to the passage word. If it matches, the subsequent items
+ * in the word array are matched to the transcript. If we have a transcript match, yay. If we have a transcript match AND
+ * it has a forward match. That will be returned so that the next pass of the match loop will accept that forward match
+ * as a match on the next passage word. This allows a passage "Dog" to be matched to "Dog's" and not flag the leftover "is" in the passage as incorrect.
+ *
+ * TODO: For this whole alternates thing ... optimize so we only parse the passage once, when it is saved,
+ * and store the index of each word with alternates, so we do not need to loop through the alternates array on checking
+ *
+ */
+ public static function fetchAlternativesArray($thealternates)
+ {
+     //nothing to parse: return an empty list
+     if(trim($thealternates)==''){
+         return [];
+     }
+
+     $alternatives = [];
+     //one alternates set per line
+     //regexp from https://stackoverflow.com/questions/7058168/explode-textarea-php-at-new-lines
+     foreach(preg_split('/\r\n|[\r\n]/', $thealternates) as $line){
+         //skip blank lines
+         if(empty(trim($line))){
+             continue;
+         }
+         //a usable set is "passageword|alternate[|alternate...]", so it needs at least two entries
+         $set = explode('|', $line);
+         if(count($set) < 2){
+             continue;
+         }
+
+         $words = [];
+         $forwardmatches = [];
+         foreach($set as $wordstring){
+             $wordstring = trim($wordstring);
+             if($wordstring==''){
+                 continue;
+             }
+             //only the first space separated token is the word itself
+             $wordsarray = explode(' ', $wordstring);
+             $word = $wordsarray[0];
+             //the wildcard marker is kept as is, every other word is cleaned
+             if($word !='*'){
+                 $word = self::cleanText($word);
+             }
+             $words[] = $word;
+
+             //a second token is stored as the forward match for this word
+             $hasforward = count($wordsarray)>1 && $word !='*' && !is_number($word);
+             if($hasforward){
+                 $forwardmatches[$word] = self::cleanText($wordsarray[1]);
+             }
+         }
+         $alternatives[] = [$words,$forwardmatches];
+     }
+     return $alternatives;
+ }
+
+ /*
+  * Ad hoc leniency for common transcription slips.
+  * For languages whose code starts with "en", the transcript word also counts as a match
+  * when it equals the passage word plus "s" or plus "ed". No other language has rules here.
+  * Returns true on such a match, false otherwise.
+  */
+ public static function generous_match($passageword,$transcriptword,$language){
+     //only the two letter language prefix matters
+     if(substr($language,0,2) != 'en'){
+         return false;
+     }
+     //try each tolerated suffix in turn
+     foreach(['s','ed'] as $suffix){
+         if(self::mb_strequals($passageword . $suffix, $transcriptword)){
+             return true;
+         }
+     }
+     return false;
+ }
+
+ //Loop through passage, nest looping through transcript building collections of sequences (passage match records)
+ //one sequence = sequence_length[length] + sequence_start(transcript)[tposition] + sequence_start(passage)[pposition]
+ //we do not discriminate over length or position of sequence at this stage. All sequences are saved
+
+ //NB The sequence length should be the same in the passage and transcript (because they "matched")
+ //But we attempted to have "multiple word alternatives" which could mean that the match length in the transcript
+ // would differ from the match length in the passage
+ //eg 1989 -> nineteen eighty nine.
+ // BUT we cancelled this feature because the code became more complex than wanted to maintain,
+ // however still kept the transcript sequence length and passage sequence length code in place in this function
+ // so we could have another go at this if needed
+ //
+ //returns array of sequences
+ public static function fetchSequences($passage, $transcript, $alternatives, $language)
+ {
+ $p_length = count($passage);
+ $t_length = count($transcript);
+ $sequences = array();
+ $t_slength=0; //sequence length (in the transcript)
+ $p_slength=0; //sequence length (in the passage)
+ $alt_positions=[]; //we record alternate usages in sequence
+ $tstart =0; //transcript sequence match search start index
+ $forwardmatch=false; //if any alternates declare a forward match we keep that here
+
+
+ //loop through passage word by word
+ for($pstart =0; $pstart < $p_length; $pstart++){
+ //loop through transcript finding matches starting from current passage word
+ //we step over the length of any sequences we have already found to begin search for next sequence
+ while($t_slength + $tstart < $t_length &&
+ $p_slength + $pstart < $p_length
+ ) {
+ //get words to compare
+ $passageword= $passage[$p_slength + $pstart];
+ $transcriptword =$transcript[$t_slength + $tstart];
+ $match=false;
+
+ //check for a forward match
+ if($forwardmatch!==false){
+ $match = self::mb_strequals($passageword, $forwardmatch);
+ //we matched a passage word + but did not use the next transcript word, so roll back t_slength
+ if($match) {
+ $t_slength--;
+ }
+ }
+ $forwardmatch=false;
+
+ //check for a direct match
+ if(!$match) {
+ $match =self::mb_strequals( $passageword,$transcriptword);
+ }
+
+ //if no direct match is there an alternates match
+ if(!$match && $alternatives){
+ $altsearch_result = self::check_alternatives_for_match($passageword,
+ $transcriptword,
+ $alternatives);
+ if($altsearch_result->match){
+ $match= true;
+ $forwardmatch=$altsearch_result->forwardmatch;
+ $alt_positions[]=($p_slength + $pstart);
+ }
+ }//end of if no direct match
+
+ //else check for a generous match(eg for english +s and +ed we give it to them)
+ if(!$match){
+ $match= self::generous_match($passageword,$transcriptword,$language);
+ }
+
+ //if we have a match and the passage and transcript each have another word, we will continue
+ //(ie to try to match the next word)
+ if ($match &&
+ ($t_slength + $tstart + 1) < $t_length &&
+ ($p_slength + $pstart + 1) < $p_length ) {
+ //continue building sequence
+ $p_slength++;
+ $t_slength++;
+
+ //We add a provisional match here. This means lots of shorter sequences added to sequences[]
+ // on the way to building the final sequence
+ //this is necessary for an unusual case where two sequences overlap
+ //at the end of one and the beginning of the other.
+ //without a provisional match, the shorter seq. will lose the election and be unselected at fetchDiffs()
+ //and the unoverlapped part will be marked unmatched
+ //this occurs with a combination of wildcards and extraneous words in transcript
+ //eg transcript: home is where the heart resides oligarchy it stomach said ...
+ //passage: home is where the heart resides Aragaki Tsutomu said ...
+ //wildcards on Aragaki and Tsutomu caused this overlap problem
+ $sequence = new \stdClass();
+ $sequence->length = $p_slength;
+ $sequence->tlength = $t_slength;
+ $sequence->tposition = $tstart;
+ $sequence->pposition = $pstart;
+ $sequence->altpositions = $alt_positions;
+ $sequences[] = $sequence;
+
+ //else: no match or end of transcript/passage,
+ } else {
+ //if we have a match here, then it's the last word of passage or transcript...
+ //we build our sequence object, store it in $sequences, and return
+ if($match){
+ $p_slength++;
+ $t_slength++;
+ $sequence = new \stdClass();
+ $sequence->length = $p_slength;
+ $sequence->tlength = $t_slength;
+ $sequence->tposition = $tstart;
+ $sequence->pposition = $pstart;
+ $sequence->altpositions = $alt_positions;
+ $sequences[] = $sequence;
+
+ //we bump tstart, which will end this loop
+ //and we reset our sequence lengths because the outer loop may yet continue
+ $tstart+= $t_slength;
+ $p_slength = 0;
+ $t_slength = 0;
+ $alt_positions =[];
+
+ //if we never even had a sequence we just move to next word in transcript
+ }elseif ($p_slength == 0) {
+ $tstart++;
+
+ //if we had a sequence but this is not a match, we build the sequence object, store it in $sequences,
+ //step transcript index and look for next sequence
+ } else {
+ $sequence = new \stdClass();
+ $sequence->length = $p_slength;
+ $sequence->tlength = $t_slength;
+ $sequence->tposition = $tstart;
+ $sequence->pposition = $pstart;
+ $sequence->altpositions = $alt_positions;
+ $sequences[] = $sequence;
+
+ //re init transcript loop variables for the next pass
+ $tstart+= $t_slength;
+ $p_slength = 0;
+ $t_slength = 0;
+ $alt_positions =[];
+
+ }//end of "IF slength=0"
+ }//end of "IF match"
+ }//end of "WHILE Transcript Index < t_length"
+ //reset transcript loop variables for each pass of passageword loop
+ $tstart=0;
+
+ }//end of "FOR each passage word"
+
+ return $sequences;
+ }//end of fetchSequences
+
+ public static function debug_print_sequence($sequence,$passage,$transcript,$tag){
+ echo ' ';
+ echo 'THE SEQUENCE: ' . $tag;
+ echo ' ';
+ print_r($sequence);
+ $printpassage = ' PASSAGE: ';
+ $printtranscript = ' TRANSCRIPT: ';
+ for($word=0;$word<$sequence->length;$word++){
+ $printpassage .= ($word . ':' . $passage[$word + $sequence->pposition] . ' ');
+ $printtranscript .= ($word . ':' . $transcript[$word + $sequence->tposition] . ' ');
+ }
+ echo $printpassage;
+ echo $printtranscript;
+ }
+
+ /*
+ * This will run through the list of alternatives for a given passageword
+ */
+ public static function check_alternatives_for_match($passageword,$transcriptword,$alternatives){
+ $ret= new \stdClass();
+ $ret->match =false;
+ $ret->matchlength=0;
+ $ret->forwardmatch=false;
+
+ //loop through all alternatives
+ //and then through each alternative->wordset
+ foreach($alternatives as $alternateset){
+ $wordset=$alternateset[0];
+ $forwardmatches=$alternateset[1];
+ if(self::mb_strequals($wordset[0],$passageword)){
+ for($setindex =1;$setindexmatch = true;
+ $ret->matchlength = 1;
+ if(array_key_exists($wordset[$setindex],$forwardmatches)){
+ $ret->forwardmatch=$forwardmatches[$wordset[$setindex]];
+ }
+ break;
+ }
+ }
+ }//end of if alternatesset[0]
+ if($ret->match){break;}
+ }//end of for each alternatives
+ //we return the matchlength
+ return $ret;
+ }
+
+ /*
+ * This will run through the alternatives and compile the wildcard words
+ * We put the passageword as array key , so later we can search for it by array_key_exists .. uurrgh
+ */
+ public static function fetchWildcardsArray($alternatives){
+ $wildcards=array();
+
+ //loop through all alternatives
+ //and then through each alternative->wordset
+ foreach($alternatives as $alternateset){
+ $wordset=$alternateset[0];
+ $passageword = $wordset[0];
+ for($setindex =1;$setindexlength == $b->length) {
+ if($a->tposition == $b->tposition){
+ return 0;
+ }else{
+ return ($a->tposition< $b->tposition) ? -1 : 1;
+ }
+ }
+ return ($a->length < $b->length) ? 1 : -1;
+ }
+
+ //returns an array of "diff" results, one for each word(ie position) in passage
+ //i) default all passage positions to unmatched (self::UNMATCHED)
+ //ii) sort sequences by length(longer sorts higher), transcript position (earlier sorts higher)
+ //iii) for each sequence
+ // a)- check passage match in sequence was not already matched by previous sequence (bust if so)
+ // b)- check transcript match in sequence[tpos -> tpos+length] was not already allocated to another part of passage in previous sequence
+ // c)- check passage match position and transcript position are consistent with previous sequences
+ // inconsistent example: If T position 3 was matched to P position 5, T position 4 could not match with P position 2
+ //iv) we do various adhoc checks based on common problems we find in the wild
+ //
+ //NB aborted supporting "multiple word alternatives" at this point. We know the sequence length in transcript
+ //but we can not add a valid tposition for a pposition in the final diff array when the pposition occurs
+ // after an alternate match in the same sequence. At that point gave up ... for now. Justin 2018/08
+ public static function fetchDiffs($sequences, $passagelength,$transcriptlength, $debug=false){
+ //i) default passage positions to unmatched and transcript position -1
+ $diffs=array_fill(0, $passagelength, [self::UNMATCHED,-1,self::NOTALTERNATEMATCH]);
+
+ //ii) sort sequences by length, transcript posn
+ //long sequences sort higher, and are placed in the diff array first
+ usort($sequences, array('\\' . constants::M_COMP . '\diff','cmp'));
+
+ //record prior sequences for iii)
+ $priorsequences=array();
+ $sequenceindex=0;
+ //iii) loop through sequences
+ foreach($sequences as $sequence){
+ $bust=false;
+ $sequenceindex++;
+
+ //iii) a) check passage position not already matched
+ //test with these sequences which should both match and not overlap
+ //A seq pposition=63 length=18
+ //B seq pposition=81 length=42
+ //remember that pposition is 0 based and so pposition=0 and length 1, is char 1 only
+ for($p=$sequence->pposition; $p < $sequence->pposition + $sequence->length; $p++){
+ if($diffs[$p][0] !=self::UNMATCHED){
+ $bust=true;
+ break;
+ }
+ }
+ if(!$bust){
+ foreach($priorsequences as $priorsequence){
+ //iii) b) check transcript match was not matched elsewhere in passage
+ if($sequence->tposition >= $priorsequence->tposition &&
+ $sequence->tposition < $priorsequence->tposition + $priorsequence->length){
+ $bust=true;
+ break;
+ }
+ //iii) c) check passage match and transcript match positions are consistent with prev. sequences
+ if($sequence->tposition <= $priorsequence->tposition &&
+ $sequence->pposition >= $priorsequence->pposition){
+ $bust=true;
+ break;
+ }
+ if($sequence->tposition >= $priorsequence->tposition &&
+ $sequence->pposition <= $priorsequence->pposition){
+ $bust=true;
+ break;
+ }
+ }
+ }
+
+ //we do a fuzzy check for various anomalies that can occur
+ if(!$bust){
+ //distance from passage location to transcript location
+ $matchdistance =$sequence->pposition - $sequence->tposition;
+
+ //distance between passage location and transcript length
+ $enddistance =$sequence->pposition - $transcriptlength;
+
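+ //ratio of alternates to full matches. NOTE(review): the code below computes length / altcount (the inverse of this description) - confirm which direction the 0.5 threshold further down is meant for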
+ $altcount = count($sequence->altpositions);
+ if($altcount) {
+ $altratio = $sequence->length / $altcount;
+ }else{
+ $altratio=0;
+ }
+
+ //common is short matches after speaking ends
+ //particularly dangerous are wildcards and alternates
+ if(($altratio >= 0.5) && $enddistance > 0){
+ $bust=true;
+ }elseif($sequence->length < $enddistance){
+ $bust=true;
+ }
+ }
+
+ if($bust){continue;}
+
+ //record sequence as :
+ //i) matched and
+ //ii) record transcript position so we can play it back.
+ //Then store sequence in prior sequences
+ for($p=$sequence->pposition; $p < $sequence->pposition + $sequence->length; $p++){
+ //word position in sequence ( 0 = first )
+ $wordposition = $p - $sequence->pposition;
+ //NB pposition starts from 1. We adjust tposition to match
+ $tposition = $sequence->tposition + $wordposition + 1;
+ //was this an alternatives match?
+ if(in_array($p,$sequence->altpositions)){
+ $altmatch=self::ALTERNATEMATCH;
+ }else{
+ $altmatch=self::NOTALTERNATEMATCH;
+ }
+
+ $diffs[$p]=[self::MATCHED,$tposition,$altmatch];
+ }
+ $priorsequences[] = $sequence;
+ }
+
+ //we are debugging return an array with some data we can look at
+ if($debug){
+ return [$diffs,$priorsequences];
+ }else{
+ return $diffs;
+ }
+ }
+
+ /*
+ * We apply wildcards after all is done.
+ * If we do it during the sequence building it can mess things up when a wildcard
+ * matches a passage word to a transcript word that should match elsewhere.
+ * e.g [passage] The big green butcher
+ * [transcript] The green butcher
+ * [alternatives] big|*
+ * In this case [transcript]green can be matched against [passage]big
+ * If the sequence containing this match is selected, then "green" can be marked as missing, and hence an error
+ *
+ * The sequence loop may or may not select the faulty sequence. Rather than patch this up with forward matches and
+ * tricks, we now leave wildcards out of sequence building and just patch up the diffs array here
+ *
+ * The same situation might occur with alternatives too, but the missed word is likely similar to the matched word
+ * e.g "The artists are this close to us." so we can accept it.
+ */
+ public static function applyWildcards($diffs,$passagebits,$wildcards){
+ $last_tposition=1;
+ $last_p=0;
+
+ //we do not want to go more than one beyond the last true matched passage word
+ //here we find the last passage match
+ for($p=count($diffs)-1;$p>=0;$p--){
+ if($diffs[$p][0]==self::MATCHED){
+ $last_p=$p;
+ break;
+ }
+ }
+ //If there is another passage word after that, it becomes the last possible wildcard match
+ if($last_p + 1
diff --git a/poodllloader.html b/poodllloader.html
index d89a3014..06a4a73c 100644
--- a/poodllloader.html
+++ b/poodllloader.html
@@ -9,9 +9,9 @@
-
+
-
+