From ddecc898e66a3ef038167d461ab4b03cf235aaa2 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 6 Feb 2024 13:20:15 -0500 Subject: [PATCH 1/4] Added chapter-level filtering; fixes https://github.com/sillsdev/serval/issues/150 --- src/SIL.Machine.AspNetCore/Models/Corpus.cs | 2 + .../Services/BiblicalRangeStringParser.cs | 165 ++++++++++++++++++ .../Services/NmtPreprocessBuildJob.cs | 25 ++- .../ServalTranslationEngineServiceV1.cs | 2 + .../BiblicalRangeStringParserTests.cs | 78 +++++++++ .../Services/NmtPreprocessBuildJobTests.cs | 78 +++++++++ .../Services/data/paratext2/41MATTen.SFM | 40 +++++ .../Services/data/paratext2/42MRKTen.SFM | 4 + .../data/paratext2/ProjectBiblicalTerms.xml | 6 + .../Services/data/paratext2/Settings.xml | 34 ++++ .../data/paratext2/TermRenderings.xml | 9 + .../Services/data/paratext2/custom.vrs | 31 ++++ tests/SIL.Machine.AspNetCore.Tests/Usings.cs | 1 + 13 files changed, 472 insertions(+), 3 deletions(-) create mode 100644 src/SIL.Machine.AspNetCore/Services/BiblicalRangeStringParser.cs create mode 100644 tests/SIL.Machine.AspNetCore.Tests/Services/BiblicalRangeStringParserTests.cs create mode 100644 tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/41MATTen.SFM create mode 100644 tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/42MRKTen.SFM create mode 100644 tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/ProjectBiblicalTerms.xml create mode 100644 tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/Settings.xml create mode 100644 tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/TermRenderings.xml create mode 100644 tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/custom.vrs diff --git a/src/SIL.Machine.AspNetCore/Models/Corpus.cs b/src/SIL.Machine.AspNetCore/Models/Corpus.cs index c33bc52ce..6680e42d4 100644 --- a/src/SIL.Machine.AspNetCore/Models/Corpus.cs +++ b/src/SIL.Machine.AspNetCore/Models/Corpus.cs @@ -7,6 +7,8 @@ public class Corpus public string TargetLanguage { get; set; } = default!; public bool TrainOnAll { get; set; } public bool PretranslateAll { get; set; } + public string? TrainOnBiblicalRange { get; set; } + public string? PretranslateBiblicalRange {get; set; } public HashSet TrainOnTextIds { get; set; } = default!; public HashSet PretranslateTextIds { get; set; } = default!; public List SourceFiles { get; set; } = default!; diff --git a/src/SIL.Machine.AspNetCore/Services/BiblicalRangeStringParser.cs b/src/SIL.Machine.AspNetCore/Services/BiblicalRangeStringParser.cs new file mode 100644 index 000000000..384ffac2d --- /dev/null +++ b/src/SIL.Machine.AspNetCore/Services/BiblicalRangeStringParser.cs @@ -0,0 +1,165 @@ +class BiblicalRangeStringParser { + private readonly Dictionary _bookLengths = []; + private static readonly Regex CommaSeparatedBooks = new Regex(@"^([A-Z\d]{3}|OT|NT)(, ?([A-Z\d]{3}|OT|NT))*$", RegexOptions.Compiled); + private static readonly Regex BookRange = new Regex(@"^-?[A-Z\d]{3}-[A-Z\d]{3}$", RegexOptions.Compiled); + private static readonly Regex ChapterSelection = new Regex(@"^-?[A-Z\d]{3} ?(\d+|\d+-\d+)(, ?(\d+|\d+-\d+))*$", RegexOptions.Compiled); + + + public BiblicalRangeStringParser(ScrVers? versification = null){ + versification ??= ScrVers.Original; + foreach((string bookId, int bookNum) in Canon.AllBookIds.Zip(Canon.AllBookNumbers)){ + _bookLengths[bookId] = versification.GetLastChapter(bookNum); + } + } + + private Dictionary> ParseSection(string section){ + section = section.Trim(); + Dictionary> chaptersPerBook = []; + + //*Specific chapters from one book* + if (char.IsAsciiDigit(section.Last())){ + string bookName = section[..3]; + if (!_bookLengths.ContainsKey(bookName)){ + throw new ArgumentException($"{bookName} is an invalid book ID."); + } + + HashSet chapters = []; + + int lastChapter = _bookLengths[bookName]; + string[] chapterRangeStrings = section[3..].Split(','); + foreach(string chapterRangeString in chapterRangeStrings.Select(s => s.Trim())){ + if(chapterRangeString.Contains('-')){ + string[] startAndEnd = chapterRangeString.Split('-'); + int start, end; + if (!(int.TryParse(startAndEnd[0], out start) && int.TryParse(startAndEnd[1], out end))){ + throw new ArgumentException($"{chapterRangeString} is an invalid chapter range."); + } + if (start == 0 || end > lastChapter || end <= start){ + throw new ArgumentException($"{chapterRangeString} is an invalid chapter range."); + } + for(int chapterNum = start; chapterNum <= end; chapterNum++){ + chapters.Add(chapterNum); + } + } + else { + int chapterNum; + if (!int.TryParse(chapterRangeString, out chapterNum)){ + throw new ArgumentException($"{section} is an invalid chapter number."); + } + if (chapterNum > lastChapter){ + throw new ArgumentException($"{section} is an invalid chapter number."); + } + chapters.Add(chapterNum); + } + } + if (chapters.Count() == lastChapter){ + chaptersPerBook[bookName] = []; + } + else { + chaptersPerBook[bookName] = chapters.ToList(); + chaptersPerBook[bookName].Sort(); + } + } + //*Ranges of books to be added* + else if(section.Contains('-')){ + string[] startAndEnd = section.Split("-"); + if (startAndEnd.Length != 2 || !_bookLengths.ContainsKey(startAndEnd[0]) || !_bookLengths.ContainsKey(startAndEnd[1]) || Canon.BookIdToNumber(startAndEnd[1]) <= Canon.BookIdToNumber(startAndEnd[0])){ + throw new ArgumentException($"{section} is an invalid book range."); + } + for(int bookNum = Canon.BookIdToNumber(startAndEnd[0]); bookNum <= Canon.BookIdToNumber(startAndEnd[1]); bookNum++){ + chaptersPerBook[Canon.BookNumberToId(bookNum)] = []; + } + } + //*OT* + else if(section == "OT"){ + for(int bookNum = 1; bookNum <= 39; bookNum++){ + chaptersPerBook[Canon.BookNumberToId(bookNum)] = []; + } + } + //*NT* + else if(section == "NT"){ + for(int bookNum = 40; bookNum <= 66; bookNum++){ + chaptersPerBook[Canon.BookNumberToId(bookNum)] = []; + } + } + //*Whole book* + else { + if(!_bookLengths.ContainsKey(section)){ + throw new ArgumentException($"{section} is an invalid book ID."); + } + chaptersPerBook[section] = []; + } + + return chaptersPerBook; + } + + public Dictionary> Parse(string chapterSelections){ + Dictionary> chaptersPerBook = []; + chapterSelections = chapterSelections.Trim(); + + char delimiter = ';'; + if(chapterSelections.Contains(';')){ + delimiter = ';'; + } + else if (CommaSeparatedBooks.IsMatch(chapterSelections)){ + delimiter = ','; + } + else if (!BookRange.IsMatch(chapterSelections) && ! ChapterSelection.IsMatch(chapterSelections)){ + throw new ArgumentException("Invalid syntax. If you are providing multiple selections, e.g. a range of books followed by a selection of chapters from a book, separate each selection with a semicolon."); + } + string[] selections = chapterSelections.Split(delimiter); + foreach (string section in selections.Select(s => s.Trim())){ + + //*Subtraction* + if (section.StartsWith('-')){ + Dictionary> sectionChapters = ParseSection(section[1..]); + foreach(string bookName in sectionChapters.Keys){ + if (!chaptersPerBook.ContainsKey(bookName)){ + throw new ArgumentException($"{bookName} cannot be removed as it is not in the existing book selection."); + } + + if (sectionChapters[bookName].Count() == 0){ + sectionChapters[bookName] = Enumerable.Range(1, _bookLengths[bookName]).ToList(); + } + + if (chaptersPerBook[bookName].Count() == 0){ + chaptersPerBook[bookName] = Enumerable.Range(1, _bookLengths[bookName]).ToList(); + } + + foreach(int chapterNumber in sectionChapters[bookName]){ + if(!chaptersPerBook[bookName].Remove(chapterNumber)){ + throw new ArgumentException($"{chapterNumber} cannot be removed as it is not in the existing chapter selection."); + } + } + + if (chaptersPerBook[bookName].Count() == 0){ + chaptersPerBook.Remove(bookName); + } + } + } + + //*Addition* + else { + Dictionary> sectionChapters = ParseSection(section); + foreach(string bookName in sectionChapters.Keys){ + if (chaptersPerBook.ContainsKey(bookName)){ + if(chaptersPerBook[bookName].Count() == 0 || sectionChapters[bookName].Count() == 0){ + chaptersPerBook[bookName] = []; + continue; + } + chaptersPerBook[bookName] = chaptersPerBook[bookName].Concat(sectionChapters[bookName]).Distinct().ToList(); + chaptersPerBook[bookName].Sort(); + if(chaptersPerBook[bookName].Count() == _bookLengths[bookName]){ + chaptersPerBook[bookName] = []; + } + } + else { + chaptersPerBook[bookName] = sectionChapters[bookName]; + } + + } + } + } + return chaptersPerBook; + } +} \ No newline at end of file diff --git a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs index 4c4423777..be0d125b1 100644 --- a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs +++ b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs @@ -1,4 +1,5 @@ -using Google.Protobuf; +using System.Data; +using Google.Protobuf; using MongoDB.Bson.IO; namespace SIL.Machine.AspNetCore.Services; @@ -129,14 +130,32 @@ async IAsyncEnumerable ProcessRowsAsync() foreach (ParallelTextRow row in parallelCorpora.Flatten()) { - if (corpus.TrainOnAll || corpus.TrainOnTextIds.Contains(row.TextId)) + bool isInTrainOnRange = false; + bool isInPretranslateRange = false; + if(targetCorpora[CorpusType.Text] is ScriptureTextCorpus stc && row.Refs.All(r => r is VerseRef)){ + Dictionary> rowChaptersPerBook = row.Refs.Cast().GroupBy(vr => vr.Book).ToDictionary(g => g.Key, g => g.Select(vr => vr.ChapterNum).ToList()); + var parser = new BiblicalRangeStringParser(stc.Versification); + if(corpus.TrainOnBiblicalRange != null && corpus.TrainOnBiblicalRange != ""){ + Dictionary> trainOnBiblicalRangeChapters = parser.Parse(corpus.TrainOnBiblicalRange); //TODO calculate once + isInTrainOnRange = rowChaptersPerBook.Join(trainOnBiblicalRangeChapters, rcpb => rcpb.Key, tobrc => tobrc.Key, (rcbp, tobrc) => + rcbp.Value.Intersect(tobrc.Value).Count() > 0 || (rcbp.Value.Count() > 0 && tobrc.Value.Count() == 0) //Empty list means all chapters from book + ).Any(b => b); + } + if(corpus.PretranslateBiblicalRange != null && corpus.PretranslateBiblicalRange != ""){ + Dictionary> pretranslateBiblicalRangeChapters = parser.Parse(corpus.PretranslateBiblicalRange); + isInPretranslateRange = rowChaptersPerBook.Join(pretranslateBiblicalRangeChapters, rcpb => rcpb.Key, pbrc => pbrc.Key, (rcbp, pbrc) => + rcbp.Value.Intersect(pbrc.Value).Count() > 0 || (rcbp.Value.Count() > 0 && pbrc.Value.Count() == 0) + ).Any(b => b); + } + } + if (corpus.TrainOnAll || corpus.TrainOnTextIds.Contains(row.TextId) || isInTrainOnRange) { await sourceTrainWriter.WriteAsync($"{row.SourceText}\n"); await targetTrainWriter.WriteAsync($"{row.TargetText}\n"); counts["NumTrainRows"] += 1; } if ( - (corpus.PretranslateAll || corpus.PretranslateTextIds.Contains(row.TextId)) + (corpus.PretranslateAll || corpus.PretranslateTextIds.Contains(row.TextId) || isInPretranslateRange) && row.SourceSegment.Count > 0 && row.TargetSegment.Count == 0 ) diff --git a/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs b/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs index 0ea6542a6..07ee170f3 100644 --- a/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs +++ b/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs @@ -262,6 +262,8 @@ private static Models.Corpus Map(Serval.Translation.V1.Corpus source) TargetLanguage = source.TargetLanguage, TrainOnAll = source.TrainOnAll, PretranslateAll = source.PretranslateAll, + TrainOnBiblicalRange = source.TrainOnBiblicalRange, + PretranslateBiblicalRange = source.PretranslateBiblicalRange, TrainOnTextIds = source.TrainOnTextIds.ToHashSet(), PretranslateTextIds = source.PretranslateTextIds.ToHashSet(), SourceFiles = source.SourceFiles.Select(Map).ToList(), diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/BiblicalRangeStringParserTests.cs b/tests/SIL.Machine.AspNetCore.Tests/Services/BiblicalRangeStringParserTests.cs new file mode 100644 index 000000000..6482191fb --- /dev/null +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/BiblicalRangeStringParserTests.cs @@ -0,0 +1,78 @@ +namespace SIL.Machine.AspNetCore.Services; + +[TestFixture] +public class BiblicalRangeStringParserTests { + + [Test] + [TestCaseSource(nameof(GetCases))] + public void TestParse(string rangeString, Dictionary> expectedOutput, bool throwsException){ + var parser = new BiblicalRangeStringParser(); + if(!throwsException){ + Assert.That(parser.Parse(rangeString), Is.EquivalentTo(expectedOutput)); + } + else { + Assert.Throws(() => { + parser.Parse(rangeString); + }); + } + } + + public static IEnumerable GetCases(){ + yield return new TestCaseData("MAL", new Dictionary>{ {"MAL" , new List()}}, false); + yield return new TestCaseData("GEN,EXO", new Dictionary>{ {"GEN" , new List()},{"EXO" , new List()} }, false); + yield return new TestCaseData("1JN,2JN", new Dictionary>{ {"1JN" , new List()},{"2JN" , new List()} }, false); + yield return new TestCaseData("OT", Enumerable.Range(1, 39).Select(i => (Canon.BookNumberToId(i), new List())).ToDictionary(), false); + yield return new TestCaseData("NT", Enumerable.Range(40, 27).Select(i => (Canon.BookNumberToId(i), new List())).ToDictionary(), false); + yield return new TestCaseData("NT,OT", Enumerable.Range(1, 66).Select(i => (Canon.BookNumberToId(i), new List())).ToDictionary(), false); + yield return new TestCaseData("MAT;MRK", new Dictionary>{ {"MAT" , new List()},{"MRK" , new List()} }, false); + yield return new TestCaseData("MAT; MRK", new Dictionary>{ {"MAT" , new List()},{"MRK" , new List()} }, false); + yield return new TestCaseData("MAT1,2,3", new Dictionary>{ {"MAT" , new List(){1,2,3}} }, false); + yield return new TestCaseData("MAT1, 2, 3", new Dictionary>{ {"MAT" , new List(){1,2,3}} }, false); + yield return new TestCaseData("MAT-LUK", new Dictionary>{ {"MAT" , new List()},{"MRK" , new List()},{"LUK" , new List()} }, false); + yield return new TestCaseData("MAT1,2,3;MAT-LUK", new Dictionary>{ {"MAT" , new List()},{"MRK" , new List()},{"LUK" , new List()} }, false); + yield return new TestCaseData("2JN-3JN;EXO1,8,3-5;GEN", new Dictionary>{ {"GEN" , new List()},{"EXO" , new List(){1,3,4,5,8}},{"2JN" , new List()},{"3JN" , new List()} }, false); + yield return new TestCaseData("1JN 1;1JN 2;1JN 3-5", new Dictionary>{ {"1JN" , new List()}}, false); + yield return new TestCaseData("MAT-ROM;-ACT4-28", new Dictionary>{ {"MAT" , new List()},{"MRK" , new List()},{"LUK" , new List()},{"JHN" , new List()},{"ACT" , new List(){1,2,3}},{"ROM" , new List()} }, false); + yield return new TestCaseData("2JN;-2JN 1", new Dictionary>{}, false); + yield return new TestCaseData("NT;OT;-MRK;-EXO", Enumerable.Range(1, 66).Where(i => i != 2 && i!= 41).Select(i => (Canon.BookNumberToId(i), new List())).ToDictionary(), false); + yield return new TestCaseData("NT;-MAT3-5,17;-REV21,22", Enumerable.Range(40, 27).Select(i => { + if (i == 40){ + return (Canon.BookNumberToId(i), Enumerable.Range(1,28).Where(c => !(c == 3 || c == 4 || c == 5 || c== 17)).ToList()); + } + if (i == 66){ + return (Canon.BookNumberToId(i), Enumerable.Range(1,20).ToList()); + } + return (Canon.BookNumberToId(i), new List()); + }).ToDictionary(), false); + yield return new TestCaseData("MAT-JHN;-MAT-LUK", new Dictionary>{ {"JHN" , new List()} }, false); + + + //*Throw exceptions + yield return new TestCaseData("MAT3-1", new Dictionary>(), true); + yield return new TestCaseData("MRK-MAT", new Dictionary>(), true); + yield return new TestCaseData("MRK;-MRK10-3", new Dictionary>(), true); + yield return new TestCaseData("MAT0-10", new Dictionary>(), true); + yield return new TestCaseData("MAT-FLUM", new Dictionary>(), true); + yield return new TestCaseData("-MAT-FLUM", new Dictionary>(), true); + yield return new TestCaseData("", new Dictionary>(), true); + yield return new TestCaseData("ABC", new Dictionary>(), true); + yield return new TestCaseData("MAT-ABC", new Dictionary>(), true); + yield return new TestCaseData("NT;-ABC-LUK", new Dictionary>(), true); + yield return new TestCaseData("MAT 500", new Dictionary>(), true); + yield return new TestCaseData("MAT 1-500", new Dictionary>(), true); + yield return new TestCaseData("MAT;-MAT 300-500", new Dictionary>(), true); + yield return new TestCaseData("-MRK", new Dictionary>(), true); + yield return new TestCaseData("-MRK 1", new Dictionary>(), true); + yield return new TestCaseData("MRK 2-5;-MRK 1-4", new Dictionary>(), true); + yield return new TestCaseData("MRK 2-5;-MRK 6", new Dictionary>(), true); + yield return new TestCaseData("OT;-MRK-LUK", new Dictionary>(), true); + yield return new TestCaseData("NT;OT;-ABC", new Dictionary>(), true); + yield return new TestCaseData("MAT;-ABC 1", new Dictionary>(), true); + yield return new TestCaseData("NT,OT,-MRK,-EXO", new Dictionary>(), true); + yield return new TestCaseData("OT,MAT1", new Dictionary>(), true); + yield return new TestCaseData("OT,MAT-LUK", new Dictionary>(), true); + + + } + +} \ No newline at end of file diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs b/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs index 3e01dccf1..4936b7f7f 100644 --- a/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs @@ -10,12 +10,17 @@ public void SetUp() Path.Combine("..", "..", "..", "Services", "data", "paratext"), Path.Combine(Path.GetTempPath(), "Project.zip") ); + ZipFile.CreateFromDirectory( + Path.Combine("..", "..", "..", "Services", "data", "paratext2"), + Path.Combine(Path.GetTempPath(), "Project2.zip") + ); } [TearDown] public void TearDown() { File.Delete(Path.Combine(Path.GetTempPath(), "Project.zip")); + File.Delete(Path.Combine(Path.GetTempPath(), "Project2.zip")); } [Test] @@ -143,6 +148,79 @@ int numEntriesWrittenToPretranslate } } + [Test] + [TestCase("MAT","1CH",23,4)] + [TestCase("NT;LEV","1CH",25,4)] + [TestCase("OT","MRK",10,0)] + [TestCase("OT","MLK",0,0, true)] + public async Task BuildJobTest_Chapterlevel( + string trainOnBiblicalRangeChapters, + string pretranslateBiblicalRangeChapters, + int numLinesWrittenToTrain, + int numEntriesWrittenToPretranslate, + bool throwsException = false + ) + { + using var env = new TestEnvironment(); + var corpus1 = new Corpus + { + Id = "corpusId1", + SourceLanguage = "en", + TargetLanguage = "es", + PretranslateAll = false, + TrainOnAll = false, + PretranslateBiblicalRange = pretranslateBiblicalRangeChapters, + TrainOnBiblicalRange = trainOnBiblicalRangeChapters, + PretranslateTextIds = new HashSet(), + TrainOnTextIds = new HashSet(), + SourceFiles = new List + { + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = Path.Combine(Path.GetTempPath(), "Project.zip") + } + }, + TargetFiles = new List + { + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = Path.Combine(Path.GetTempPath(), "Project2.zip") + } + } + }; + var corpora = new ReadOnlyList(new List { corpus1 }); + if (!throwsException){ + await env.BuildJob.RunAsync("engine1", "build1", corpora, "{\"use_key_terms\":false}", default); + } else { + Assert.ThrowsAsync(async () => { + await env.BuildJob.RunAsync("engine1", "build1", corpora, "{\"use_key_terms\":false}", default); + }); + return; + } + using (var stream = await env.SharedFileService.OpenReadAsync("builds/build1/train.src.txt")) + { + using (var reader = new StreamReader(stream)) + { + //Split yields one more segment that there are new lines; thus, the "- 1" + string text = reader.ReadToEnd(); + Assert.That(text.Split("\n").Length - 1, Is.EqualTo(numLinesWrittenToTrain), text); + } + } + using (var stream = await env.SharedFileService.OpenReadAsync("builds/build1/pretranslate.src.json")) + { + using (var reader = new StreamReader(stream)) + { + JsonArray? pretranslationJsonObject = JsonSerializer.Deserialize(reader.ReadToEnd()); + Assert.NotNull(pretranslationJsonObject); + Assert.That(pretranslationJsonObject!.ToList().Count, Is.EqualTo(numEntriesWrittenToPretranslate), JsonSerializer.Serialize(pretranslationJsonObject)); + } + } + } + private class TestEnvironment : DisposableBase { public ISharedFileService SharedFileService { get; } diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/41MATTen.SFM b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/41MATTen.SFM new file mode 100644 index 000000000..83a1f6792 --- /dev/null +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/41MATTen.SFM @@ -0,0 +1,40 @@ +\id MAT - Test +\h Matthew +\mt Matthew +\ip An introduction to Matthew +\c 1 +\s Chapter One +\v 1 Chapter \pn one\+pro WON\+pro*\pn*, verse one.\f + \fr 1:1: \ft This is a footnote.\f* +\li1 +\v 2 \bd C\bd*hapter one, +\li2 verse\f + \fr 1:2: \ft This is a footnote.\f* two. +\v 3 Chapter one, +\li2 verse three. +\v 4 Chapter one,  +\li2 verse four, +\v 5 Chapter one, +\li2 verse \fig Figure 1|src="image1.png" size="col" ref="1:5"\fig* five. +\c 2 +\s1 Chapter Two +\p +\v 1 Chapter \add two\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one. +\v 2-3 Chapter two, verse \fm ∆\fm*two. +\v 3-4a Chapter two, verse \w three|lemma\w*. +\v 4b Chapter two, verse four. +\p +\v 6 Chapter two, verse \w six|strong="12345" \w*. +\v 6 Bad verse. +\v 5 Chapter two, verse five \rq (MAT 3:1)\rq*. +\v 7a Chapter two, verse seven A, +\s Section header +\p +\v 7b verse seven B. +\p +\v 8 This is a list: +\b +\tr \tc1 +\v 9 Chapter\tcr2 2\tc3 verse\tcr4 9 +\tr \tc1-2 +\v 10 \tc3-4 Chapter 2 verse 10 +\v 11-12 +\restore restore information diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/42MRKTen.SFM b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/42MRKTen.SFM new file mode 100644 index 000000000..460009633 --- /dev/null +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/42MRKTen.SFM @@ -0,0 +1,4 @@ +\id MRK - Test +\h Mark +\mt Mark +\ip An introduction to Mark \ No newline at end of file diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/ProjectBiblicalTerms.xml b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/ProjectBiblicalTerms.xml new file mode 100644 index 000000000..8bdbc4d21 --- /dev/null +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/ProjectBiblicalTerms.xml @@ -0,0 +1,6 @@ + + + PN + Abba + + \ No newline at end of file diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/Settings.xml b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/Settings.xml new file mode 100644 index 000000000..268bde64c --- /dev/null +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/Settings.xml @@ -0,0 +1,34 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test2 + 65001 + T + + NFC + Ten + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + Ten.SFM + Project:Ten:ProjectBiblicalTerms.xml + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/TermRenderings.xml b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/TermRenderings.xml new file mode 100644 index 000000000..debd73df5 --- /dev/null +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/TermRenderings.xml @@ -0,0 +1,9 @@ + + + Abba + + + + + + diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/custom.vrs b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/custom.vrs new file mode 100644 index 000000000..9c1cd3873 --- /dev/null +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/data/paratext2/custom.vrs @@ -0,0 +1,31 @@ +# custom.vrs + +LEV 14:56 +ROM 14:26 +REV 12:17 +TOB 5:22 +TOB 10:12 +SIR 23:28 +ESG 1:22 +ESG 3:15 +ESG 5:14 +ESG 8:17 +ESG 10:14 +SIR 33:33 +SIR 41:24 +BAR 1:22 +4MA 7:25 +4MA 12:20 + +# deliberately missing verses +-ROM 16:26 +-ROM 16:27 +-3JN 1:15 +-S3Y 1:49 +-ESG 4:6 +-ESG 9:5 +-ESG 9:30 + +LEV 14:55 = LEV 14:55 +LEV 14:55 = LEV 14:56 +LEV 14:56 = LEV 14:57 diff --git a/tests/SIL.Machine.AspNetCore.Tests/Usings.cs b/tests/SIL.Machine.AspNetCore.Tests/Usings.cs index 222a7a747..2a16362cf 100644 --- a/tests/SIL.Machine.AspNetCore.Tests/Usings.cs +++ b/tests/SIL.Machine.AspNetCore.Tests/Usings.cs @@ -21,4 +21,5 @@ global using SIL.Machine.Translation; global using SIL.Machine.Utils; global using SIL.ObjectModel; +global using SIL.Scripture; global using SIL.WritingSystems; From 3ca1e68a380a7cf274a74f04c6a307c6028e0a28 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 6 Feb 2024 13:20:15 -0500 Subject: [PATCH 2/4] Move scripture range parsing to Serval --- src/SIL.Machine.AspNetCore/Models/Corpus.cs | 4 +- .../Services/BiblicalRangeStringParser.cs | 165 ------------ .../Services/NmtPreprocessBuildJob.cs | 62 +++-- .../ServalTranslationEngineServiceV1.cs | 8 +- .../Scripture/ScriptureRangeParser.cs | 238 ++++++++++++++++++ .../BiblicalRangeStringParserTests.cs | 78 ------ .../Services/NmtPreprocessBuildJobTests.cs | 114 ++++++--- .../Scripture/ScriptureRangeParserTests.cs | 205 +++++++++++++++ 8 files changed, 572 insertions(+), 302 deletions(-) delete mode 100644 src/SIL.Machine.AspNetCore/Services/BiblicalRangeStringParser.cs create mode 100644 src/SIL.Machine/Scripture/ScriptureRangeParser.cs delete mode 100644 tests/SIL.Machine.AspNetCore.Tests/Services/BiblicalRangeStringParserTests.cs create mode 100644 tests/SIL.Machine.Tests/Scripture/ScriptureRangeParserTests.cs diff --git a/src/SIL.Machine.AspNetCore/Models/Corpus.cs b/src/SIL.Machine.AspNetCore/Models/Corpus.cs index 6680e42d4..a40f3ff32 100644 --- a/src/SIL.Machine.AspNetCore/Models/Corpus.cs +++ b/src/SIL.Machine.AspNetCore/Models/Corpus.cs @@ -7,8 +7,8 @@ public class Corpus public string TargetLanguage { get; set; } = default!; public bool TrainOnAll { get; set; } public bool PretranslateAll { get; set; } - public string? TrainOnBiblicalRange { get; set; } - public string? PretranslateBiblicalRange {get; set; } + public Dictionary>? TrainOnChapters { get; set; } + public Dictionary>? PretranslateChapters { get; set; } public HashSet TrainOnTextIds { get; set; } = default!; public HashSet PretranslateTextIds { get; set; } = default!; public List SourceFiles { get; set; } = default!; diff --git a/src/SIL.Machine.AspNetCore/Services/BiblicalRangeStringParser.cs b/src/SIL.Machine.AspNetCore/Services/BiblicalRangeStringParser.cs deleted file mode 100644 index 384ffac2d..000000000 --- a/src/SIL.Machine.AspNetCore/Services/BiblicalRangeStringParser.cs +++ /dev/null @@ -1,165 +0,0 @@ -class BiblicalRangeStringParser { - private readonly Dictionary _bookLengths = []; - private static readonly Regex CommaSeparatedBooks = new Regex(@"^([A-Z\d]{3}|OT|NT)(, ?([A-Z\d]{3}|OT|NT))*$", RegexOptions.Compiled); - private static readonly Regex BookRange = new Regex(@"^-?[A-Z\d]{3}-[A-Z\d]{3}$", RegexOptions.Compiled); - private static readonly Regex ChapterSelection = new Regex(@"^-?[A-Z\d]{3} ?(\d+|\d+-\d+)(, ?(\d+|\d+-\d+))*$", RegexOptions.Compiled); - - - public BiblicalRangeStringParser(ScrVers? versification = null){ - versification ??= ScrVers.Original; - foreach((string bookId, int bookNum) in Canon.AllBookIds.Zip(Canon.AllBookNumbers)){ - _bookLengths[bookId] = versification.GetLastChapter(bookNum); - } - } - - private Dictionary> ParseSection(string section){ - section = section.Trim(); - Dictionary> chaptersPerBook = []; - - //*Specific chapters from one book* - if (char.IsAsciiDigit(section.Last())){ - string bookName = section[..3]; - if (!_bookLengths.ContainsKey(bookName)){ - throw new ArgumentException($"{bookName} is an invalid book ID."); - } - - HashSet chapters = []; - - int lastChapter = _bookLengths[bookName]; - string[] chapterRangeStrings = section[3..].Split(','); - foreach(string chapterRangeString in chapterRangeStrings.Select(s => s.Trim())){ - if(chapterRangeString.Contains('-')){ - string[] startAndEnd = chapterRangeString.Split('-'); - int start, end; - if (!(int.TryParse(startAndEnd[0], out start) && int.TryParse(startAndEnd[1], out end))){ - throw new ArgumentException($"{chapterRangeString} is an invalid chapter range."); - } - if (start == 0 || end > lastChapter || end <= start){ - throw new ArgumentException($"{chapterRangeString} is an invalid chapter range."); - } - for(int chapterNum = start; chapterNum <= end; chapterNum++){ - chapters.Add(chapterNum); - } - } - else { - int chapterNum; - if (!int.TryParse(chapterRangeString, out chapterNum)){ - throw new ArgumentException($"{section} is an invalid chapter number."); - } - if (chapterNum > lastChapter){ - throw new ArgumentException($"{section} is an invalid chapter number."); - } - chapters.Add(chapterNum); - } - } - if (chapters.Count() == lastChapter){ - chaptersPerBook[bookName] = []; - } - else { - chaptersPerBook[bookName] = chapters.ToList(); - chaptersPerBook[bookName].Sort(); - } - } - //*Ranges of books to be added* - else if(section.Contains('-')){ - string[] startAndEnd = section.Split("-"); - if (startAndEnd.Length != 2 || !_bookLengths.ContainsKey(startAndEnd[0]) || !_bookLengths.ContainsKey(startAndEnd[1]) || Canon.BookIdToNumber(startAndEnd[1]) <= Canon.BookIdToNumber(startAndEnd[0])){ - throw new ArgumentException($"{section} is an invalid book range."); - } - for(int bookNum = Canon.BookIdToNumber(startAndEnd[0]); bookNum <= Canon.BookIdToNumber(startAndEnd[1]); bookNum++){ - chaptersPerBook[Canon.BookNumberToId(bookNum)] = []; - } - } - //*OT* - else if(section == "OT"){ - for(int bookNum = 1; bookNum <= 39; bookNum++){ - chaptersPerBook[Canon.BookNumberToId(bookNum)] = []; - } - } - //*NT* - else if(section == "NT"){ - for(int bookNum = 40; bookNum <= 66; bookNum++){ - chaptersPerBook[Canon.BookNumberToId(bookNum)] = []; - } - } - //*Whole book* - else { - if(!_bookLengths.ContainsKey(section)){ - throw new ArgumentException($"{section} is an invalid book ID."); - } - chaptersPerBook[section] = []; - } - - return chaptersPerBook; - } - - public Dictionary> Parse(string chapterSelections){ - Dictionary> chaptersPerBook = []; - chapterSelections = chapterSelections.Trim(); - - char delimiter = ';'; - if(chapterSelections.Contains(';')){ - delimiter = ';'; - } - else if (CommaSeparatedBooks.IsMatch(chapterSelections)){ - delimiter = ','; - } - else if (!BookRange.IsMatch(chapterSelections) && ! ChapterSelection.IsMatch(chapterSelections)){ - throw new ArgumentException("Invalid syntax. If you are providing multiple selections, e.g. a range of books followed by a selection of chapters from a book, separate each selection with a semicolon."); - } - string[] selections = chapterSelections.Split(delimiter); - foreach (string section in selections.Select(s => s.Trim())){ - - //*Subtraction* - if (section.StartsWith('-')){ - Dictionary> sectionChapters = ParseSection(section[1..]); - foreach(string bookName in sectionChapters.Keys){ - if (!chaptersPerBook.ContainsKey(bookName)){ - throw new ArgumentException($"{bookName} cannot be removed as it is not in the existing book selection."); - } - - if (sectionChapters[bookName].Count() == 0){ - sectionChapters[bookName] = Enumerable.Range(1, _bookLengths[bookName]).ToList(); - } - - if (chaptersPerBook[bookName].Count() == 0){ - chaptersPerBook[bookName] = Enumerable.Range(1, _bookLengths[bookName]).ToList(); - } - - foreach(int chapterNumber in sectionChapters[bookName]){ - if(!chaptersPerBook[bookName].Remove(chapterNumber)){ - throw new ArgumentException($"{chapterNumber} cannot be removed as it is not in the existing chapter selection."); - } - } - - if (chaptersPerBook[bookName].Count() == 0){ - chaptersPerBook.Remove(bookName); - } - } - } - - //*Addition* - else { - Dictionary> sectionChapters = ParseSection(section); - foreach(string bookName in sectionChapters.Keys){ - if (chaptersPerBook.ContainsKey(bookName)){ - if(chaptersPerBook[bookName].Count() == 0 || sectionChapters[bookName].Count() == 0){ - chaptersPerBook[bookName] = []; - continue; - } - chaptersPerBook[bookName] = chaptersPerBook[bookName].Concat(sectionChapters[bookName]).Distinct().ToList(); - chaptersPerBook[bookName].Sort(); - if(chaptersPerBook[bookName].Count() == _bookLengths[bookName]){ - chaptersPerBook[bookName] = []; - } - } - else { - chaptersPerBook[bookName] = sectionChapters[bookName]; - } - - } - } - } - return chaptersPerBook; - } -} \ No newline at end of file diff --git a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs index be0d125b1..1935bc2b9 100644 --- a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs +++ b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs @@ -130,32 +130,58 @@ async IAsyncEnumerable ProcessRowsAsync() foreach (ParallelTextRow row in parallelCorpora.Flatten()) { - bool isInTrainOnRange = false; - bool isInPretranslateRange = false; - if(targetCorpora[CorpusType.Text] is ScriptureTextCorpus stc && row.Refs.All(r => r is VerseRef)){ - Dictionary> rowChaptersPerBook = row.Refs.Cast().GroupBy(vr => vr.Book).ToDictionary(g => g.Key, g => g.Select(vr => vr.ChapterNum).ToList()); - var parser = new BiblicalRangeStringParser(stc.Versification); - if(corpus.TrainOnBiblicalRange != null && corpus.TrainOnBiblicalRange != ""){ - Dictionary> trainOnBiblicalRangeChapters = parser.Parse(corpus.TrainOnBiblicalRange); //TODO calculate once - isInTrainOnRange = rowChaptersPerBook.Join(trainOnBiblicalRangeChapters, rcpb => rcpb.Key, tobrc => tobrc.Key, (rcbp, tobrc) => - rcbp.Value.Intersect(tobrc.Value).Count() > 0 || (rcbp.Value.Count() > 0 && tobrc.Value.Count() == 0) //Empty list means all chapters from book - ).Any(b => b); - } - if(corpus.PretranslateBiblicalRange != null && corpus.PretranslateBiblicalRange != ""){ - Dictionary> pretranslateBiblicalRangeChapters = parser.Parse(corpus.PretranslateBiblicalRange); - isInPretranslateRange = rowChaptersPerBook.Join(pretranslateBiblicalRangeChapters, rcpb => rcpb.Key, pbrc => pbrc.Key, (rcbp, pbrc) => - rcbp.Value.Intersect(pbrc.Value).Count() > 0 || (rcbp.Value.Count() > 0 && pbrc.Value.Count() == 0) - ).Any(b => b); + bool isInTrainOnChapters = false; + bool isInPretranslateChapters = false; + if (targetCorpora[CorpusType.Text] is ScriptureTextCorpus stc && row.Refs.All(r => r is VerseRef)) + { + Dictionary>? rowChaptersPerBook = null; + if (corpus.TrainOnChapters != null || corpus.PretranslateChapters != null) + { + rowChaptersPerBook = row + .Refs.Cast() + .GroupBy(vr => vr.Book) + .ToDictionary(g => g.Key, g => g.Select(vr => vr.ChapterNum).ToList()); + + if (corpus.TrainOnChapters != null) + { + isInTrainOnChapters = rowChaptersPerBook + .Join( + corpus.TrainOnChapters, + rcpb => rcpb.Key, + tobrc => tobrc.Key, + (rcbp, tobrc) => + rcbp.Value.Intersect(tobrc.Value).Count() > 0 + || (rcbp.Value.Count() > 0 && tobrc.Value.Count() == 0) //Empty list means all chapters from book + ) + .Any(b => b); + } + if (corpus.PretranslateChapters != null) + { + isInPretranslateChapters = rowChaptersPerBook + .Join( + corpus.PretranslateChapters, + rcpb => rcpb.Key, + pbrc => pbrc.Key, + (rcbp, pbrc) => + rcbp.Value.Intersect(pbrc.Value).Count() > 0 + || (rcbp.Value.Count() > 0 && pbrc.Value.Count() == 0) + ) + .Any(b => b); + } } } - if (corpus.TrainOnAll || corpus.TrainOnTextIds.Contains(row.TextId) || isInTrainOnRange) + if (corpus.TrainOnAll || corpus.TrainOnTextIds.Contains(row.TextId) || isInTrainOnChapters) { await sourceTrainWriter.WriteAsync($"{row.SourceText}\n"); await targetTrainWriter.WriteAsync($"{row.TargetText}\n"); counts["NumTrainRows"] += 1; } if ( - (corpus.PretranslateAll || corpus.PretranslateTextIds.Contains(row.TextId) || isInPretranslateRange) + ( + corpus.PretranslateAll + || corpus.PretranslateTextIds.Contains(row.TextId) + || isInPretranslateChapters + ) && row.SourceSegment.Count > 0 && row.TargetSegment.Count == 0 ) diff --git a/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs b/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs index 07ee170f3..0239e4c69 100644 --- a/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs +++ b/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs @@ -262,8 +262,12 @@ private static Models.Corpus Map(Serval.Translation.V1.Corpus source) TargetLanguage = source.TargetLanguage, TrainOnAll = source.TrainOnAll, PretranslateAll = source.PretranslateAll, - TrainOnBiblicalRange = source.TrainOnBiblicalRange, - PretranslateBiblicalRange = source.PretranslateBiblicalRange, + TrainOnChapters = source + .TrainOnChapters.Select(kvp => (kvp.Key, kvp.Value.Chapters.ToList())) + .ToDictionary(), + PretranslateChapters = source + .PretranslateChapters.Select(kvp => (kvp.Key, kvp.Value.Chapters.ToList())) + .ToDictionary(), TrainOnTextIds = source.TrainOnTextIds.ToHashSet(), PretranslateTextIds = source.PretranslateTextIds.ToHashSet(), SourceFiles = source.SourceFiles.Select(Map).ToList(), diff --git a/src/SIL.Machine/Scripture/ScriptureRangeParser.cs b/src/SIL.Machine/Scripture/ScriptureRangeParser.cs new file mode 100644 index 000000000..04f7c6282 --- /dev/null +++ b/src/SIL.Machine/Scripture/ScriptureRangeParser.cs @@ -0,0 +1,238 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text.RegularExpressions; +using SIL.Extensions; +using SIL.Scripture; + +public class ScriptureRangeParser +{ + private readonly Dictionary _bookLengths = new Dictionary(); + private static readonly Regex CommaSeparatedBooks = new Regex( + @"^([A-Z\d]{3}|OT|NT)(, ?([A-Z\d]{3}|OT|NT))*$", + RegexOptions.Compiled + ); + private static readonly Regex BookRange = new Regex(@"^-?[A-Z\d]{3}-[A-Z\d]{3}$", RegexOptions.Compiled); + private static readonly Regex ChapterSelection = new Regex( + @"^-?[A-Z\d]{3} ?(\d+|\d+-\d+)(, ?(\d+|\d+-\d+))*$", + RegexOptions.Compiled + ); + + public ScriptureRangeParser(ScrVers versification = null) + { + if (versification == null) + versification = ScrVers.Original; + foreach ((string bookId, int bookNum) in Canon.AllBookIds.Zip(Canon.AllBookNumbers)) + { + _bookLengths[bookId] = versification.GetLastChapter(bookNum); + } + } + + private Dictionary> ParseSection(string section) + { + section = section.Trim(); + Dictionary> chaptersPerBook = new Dictionary>(); + + //*Specific chapters from one book* + if (char.IsDigit(section.Last())) + { + string bookName = section.Substring(0, 3); + if (!_bookLengths.ContainsKey(bookName)) + { + throw new ArgumentException($"{bookName} is an invalid book ID."); + } + + HashSet chapters = new HashSet(); + + int lastChapter = _bookLengths[bookName]; + string[] chapterRangeStrings = section.Substring(3).Split(','); + foreach (string chapterRangeString in chapterRangeStrings.Select(s => s.Trim())) + { + if (chapterRangeString.Contains('-')) + { + string[] startAndEnd = chapterRangeString.Split('-'); + int start, + end; + if (!(int.TryParse(startAndEnd[0], out start) && int.TryParse(startAndEnd[1], out end))) + { + throw new ArgumentException($"{chapterRangeString} is an invalid chapter range."); + } + if (start == 0 || end > lastChapter || end <= start) + { + throw new ArgumentException($"{chapterRangeString} is an invalid chapter range."); + } + for (int chapterNum = start; chapterNum <= end; chapterNum++) + { + chapters.Add(chapterNum); + } + } + else + { + int chapterNum; + if (!int.TryParse(chapterRangeString, out chapterNum)) + { + throw new ArgumentException($"{section} is an invalid chapter number."); + } + if (chapterNum > lastChapter) + { + throw new ArgumentException($"{section} is an invalid chapter number."); + } + chapters.Add(chapterNum); + } + } + if (chapters.Count() == lastChapter) + { + chaptersPerBook[bookName] = new List(); + } + else + { + chaptersPerBook[bookName] = chapters.ToList(); + chaptersPerBook[bookName].Sort(); + } + } + //*Ranges of books to be added* + else if (section.Contains('-')) + { + string[] startAndEnd = section.Split('-'); + if ( + startAndEnd.Length != 2 + || !_bookLengths.ContainsKey(startAndEnd[0]) + || !_bookLengths.ContainsKey(startAndEnd[1]) + || Canon.BookIdToNumber(startAndEnd[1]) <= Canon.BookIdToNumber(startAndEnd[0]) + ) + { + throw new ArgumentException($"{section} is an invalid book range."); + } + for ( + int bookNum = Canon.BookIdToNumber(startAndEnd[0]); + bookNum <= Canon.BookIdToNumber(startAndEnd[1]); + bookNum++ + ) + { + chaptersPerBook[Canon.BookNumberToId(bookNum)] = new List(); + } + } + //*OT* + else if (section == "OT") + { + for (int bookNum = 1; bookNum <= 39; bookNum++) + { + chaptersPerBook[Canon.BookNumberToId(bookNum)] = new List(); + } + } + //*NT* + else if (section == "NT") + { + for (int bookNum = 40; bookNum <= 66; bookNum++) + { + chaptersPerBook[Canon.BookNumberToId(bookNum)] = new List(); + } + } + //*Whole book* + else + { + if (!_bookLengths.ContainsKey(section)) + { + throw new ArgumentException($"{section} is an invalid book ID."); + } + chaptersPerBook[section] = new List(); + } + + return chaptersPerBook; + } + + public Dictionary> GetChapters(string chapterSelections) + { + Dictionary> chaptersPerBook = new Dictionary>(); + chapterSelections = chapterSelections.Trim(); + + char delimiter = ';'; + if (chapterSelections.Contains(';')) + { + delimiter = ';'; + } + else if (CommaSeparatedBooks.IsMatch(chapterSelections)) + { + delimiter = ','; + } + else if (!BookRange.IsMatch(chapterSelections) && !ChapterSelection.IsMatch(chapterSelections)) + { + throw new ArgumentException( + "Invalid syntax. If you are providing multiple selections, e.g. a range of books followed by a selection of chapters from a book, separate each selection with a semicolon." + ); + } + string[] selections = chapterSelections.Split(delimiter); + foreach (string section in selections.Select(s => s.Trim())) + { + //*Subtraction* + if (section.StartsWith("-")) + { + Dictionary> sectionChapters = ParseSection(section.Substring(1)); + foreach (string bookName in sectionChapters.Keys) + { + if (!chaptersPerBook.ContainsKey(bookName)) + { + throw new ArgumentException( + $"{bookName} cannot be removed as it is not in the existing book selection." + ); + } + + if (sectionChapters[bookName].Count() == 0) + { + sectionChapters[bookName] = Enumerable.Range(1, _bookLengths[bookName]).ToList(); + } + + if (chaptersPerBook[bookName].Count() == 0) + { + chaptersPerBook[bookName] = Enumerable.Range(1, _bookLengths[bookName]).ToList(); + } + + foreach (int chapterNumber in sectionChapters[bookName]) + { + if (!chaptersPerBook[bookName].Remove(chapterNumber)) + { + throw new ArgumentException( + $"{chapterNumber} cannot be removed as it is not in the existing chapter selection." + ); + } + } + + if (chaptersPerBook[bookName].Count() == 0) + { + chaptersPerBook.Remove(bookName); + } + } + } + //*Addition* + else + { + Dictionary> sectionChapters = ParseSection(section); + foreach (string bookName in sectionChapters.Keys) + { + if (chaptersPerBook.ContainsKey(bookName)) + { + if (chaptersPerBook[bookName].Count() == 0 || sectionChapters[bookName].Count() == 0) + { + chaptersPerBook[bookName] = new List(); + continue; + } + chaptersPerBook[bookName] = chaptersPerBook[bookName] + .Concat(sectionChapters[bookName]) + .Distinct() + .ToList(); + chaptersPerBook[bookName].Sort(); + if (chaptersPerBook[bookName].Count() == _bookLengths[bookName]) + { + chaptersPerBook[bookName] = new List(); + } + } + else + { + chaptersPerBook[bookName] = sectionChapters[bookName]; + } + } + } + } + return chaptersPerBook; + } +} diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/BiblicalRangeStringParserTests.cs b/tests/SIL.Machine.AspNetCore.Tests/Services/BiblicalRangeStringParserTests.cs deleted file mode 100644 index 6482191fb..000000000 --- a/tests/SIL.Machine.AspNetCore.Tests/Services/BiblicalRangeStringParserTests.cs +++ /dev/null @@ -1,78 +0,0 @@ -namespace SIL.Machine.AspNetCore.Services; - -[TestFixture] -public class BiblicalRangeStringParserTests { - - [Test] - [TestCaseSource(nameof(GetCases))] - public void TestParse(string rangeString, Dictionary> expectedOutput, bool throwsException){ - var parser = new BiblicalRangeStringParser(); - if(!throwsException){ - Assert.That(parser.Parse(rangeString), Is.EquivalentTo(expectedOutput)); - } - else { - Assert.Throws(() => { - parser.Parse(rangeString); - }); - } - } - - public static IEnumerable GetCases(){ - yield return new TestCaseData("MAL", new Dictionary>{ {"MAL" , new List()}}, false); - yield return new TestCaseData("GEN,EXO", new Dictionary>{ {"GEN" , new List()},{"EXO" , new List()} }, false); - yield return new TestCaseData("1JN,2JN", new Dictionary>{ {"1JN" , new List()},{"2JN" , new List()} }, false); - yield return new TestCaseData("OT", Enumerable.Range(1, 39).Select(i => (Canon.BookNumberToId(i), new List())).ToDictionary(), false); - yield return new TestCaseData("NT", Enumerable.Range(40, 27).Select(i => (Canon.BookNumberToId(i), new List())).ToDictionary(), false); - yield return new TestCaseData("NT,OT", Enumerable.Range(1, 66).Select(i => (Canon.BookNumberToId(i), new List())).ToDictionary(), false); - yield return new TestCaseData("MAT;MRK", new Dictionary>{ {"MAT" , new List()},{"MRK" , new List()} }, false); - yield return new TestCaseData("MAT; MRK", new Dictionary>{ {"MAT" , new List()},{"MRK" , new List()} }, false); - yield return new TestCaseData("MAT1,2,3", new Dictionary>{ {"MAT" , new List(){1,2,3}} }, false); - yield return new TestCaseData("MAT1, 2, 3", new Dictionary>{ {"MAT" , new List(){1,2,3}} }, false); - yield return new TestCaseData("MAT-LUK", new Dictionary>{ {"MAT" , new List()},{"MRK" , new List()},{"LUK" , new List()} }, false); - yield return new TestCaseData("MAT1,2,3;MAT-LUK", new Dictionary>{ {"MAT" , new List()},{"MRK" , new List()},{"LUK" , new List()} }, false); - yield return new TestCaseData("2JN-3JN;EXO1,8,3-5;GEN", new Dictionary>{ {"GEN" , new List()},{"EXO" , new List(){1,3,4,5,8}},{"2JN" , new List()},{"3JN" , new List()} }, false); - yield return new TestCaseData("1JN 1;1JN 2;1JN 3-5", new Dictionary>{ {"1JN" , new List()}}, false); - yield return new TestCaseData("MAT-ROM;-ACT4-28", new Dictionary>{ {"MAT" , new List()},{"MRK" , new List()},{"LUK" , new List()},{"JHN" , new List()},{"ACT" , new List(){1,2,3}},{"ROM" , new List()} }, false); - yield return new TestCaseData("2JN;-2JN 1", new Dictionary>{}, false); - yield return new TestCaseData("NT;OT;-MRK;-EXO", Enumerable.Range(1, 66).Where(i => i != 2 && i!= 41).Select(i => (Canon.BookNumberToId(i), new List())).ToDictionary(), false); - yield return new TestCaseData("NT;-MAT3-5,17;-REV21,22", Enumerable.Range(40, 27).Select(i => { - if (i == 40){ - return (Canon.BookNumberToId(i), Enumerable.Range(1,28).Where(c => !(c == 3 || c == 4 || c == 5 || c== 17)).ToList()); - } - if (i == 66){ - return (Canon.BookNumberToId(i), Enumerable.Range(1,20).ToList()); - } - return (Canon.BookNumberToId(i), new List()); - }).ToDictionary(), false); - yield return new TestCaseData("MAT-JHN;-MAT-LUK", new Dictionary>{ {"JHN" , new List()} }, false); - - - //*Throw exceptions - yield return new TestCaseData("MAT3-1", new Dictionary>(), true); - yield return new TestCaseData("MRK-MAT", new Dictionary>(), true); - yield return new TestCaseData("MRK;-MRK10-3", new Dictionary>(), true); - yield return new TestCaseData("MAT0-10", new Dictionary>(), true); - yield return new TestCaseData("MAT-FLUM", new Dictionary>(), true); - yield return new TestCaseData("-MAT-FLUM", new Dictionary>(), true); - yield return new TestCaseData("", new Dictionary>(), true); - yield return new TestCaseData("ABC", new Dictionary>(), true); - yield return new TestCaseData("MAT-ABC", new Dictionary>(), true); - yield return new TestCaseData("NT;-ABC-LUK", new Dictionary>(), true); - yield return new TestCaseData("MAT 500", new Dictionary>(), true); - yield return new TestCaseData("MAT 1-500", new Dictionary>(), true); - yield return new TestCaseData("MAT;-MAT 300-500", new Dictionary>(), true); - yield return new TestCaseData("-MRK", new Dictionary>(), true); - yield return new TestCaseData("-MRK 1", new Dictionary>(), true); - yield return new TestCaseData("MRK 2-5;-MRK 1-4", new Dictionary>(), true); - yield return new TestCaseData("MRK 2-5;-MRK 6", new Dictionary>(), true); - yield return new TestCaseData("OT;-MRK-LUK", new Dictionary>(), true); - yield return new TestCaseData("NT;OT;-ABC", new Dictionary>(), true); - yield return new TestCaseData("MAT;-ABC 1", new Dictionary>(), true); - yield return new TestCaseData("NT,OT,-MRK,-EXO", new Dictionary>(), true); - yield return new TestCaseData("OT,MAT1", new Dictionary>(), true); - yield return new TestCaseData("OT,MAT-LUK", new Dictionary>(), true); - - - } - -} \ No newline at end of file diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs b/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs index 4936b7f7f..a59328883 100644 --- a/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs @@ -149,10 +149,10 @@ int numEntriesWrittenToPretranslate } [Test] - [TestCase("MAT","1CH",23,4)] - [TestCase("NT;LEV","1CH",25,4)] - [TestCase("OT","MRK",10,0)] - [TestCase("OT","MLK",0,0, true)] + [TestCase("MAT", "1CH", 23, 4)] + [TestCase("NT;LEV", "1CH", 25, 4)] + [TestCase("OT", "MRK", 10, 0)] + [TestCase("OT", "MLK", 0, 0, true)] public async Task BuildJobTest_Chapterlevel( string trainOnBiblicalRangeChapters, string pretranslateBiblicalRangeChapters, @@ -162,45 +162,81 @@ public async Task BuildJobTest_Chapterlevel( ) { using var env = new TestEnvironment(); - var corpus1 = new Corpus + var parser = new ScriptureRangeParser(); + + Corpus corpus1 = new Corpus(); + if (throwsException) { - Id = "corpusId1", - SourceLanguage = "en", - TargetLanguage = "es", - PretranslateAll = false, - TrainOnAll = false, - PretranslateBiblicalRange = pretranslateBiblicalRangeChapters, - TrainOnBiblicalRange = trainOnBiblicalRangeChapters, - PretranslateTextIds = new HashSet(), - TrainOnTextIds = new HashSet(), - SourceFiles = new List + Assert.Throws(() => { - new CorpusFile + corpus1 = new Corpus { - TextId = "textId1", - Format = FileFormat.Paratext, - Location = Path.Combine(Path.GetTempPath(), "Project.zip") - } - }, - TargetFiles = new List + Id = "corpusId1", + SourceLanguage = "en", + TargetLanguage = "es", + PretranslateAll = false, + TrainOnAll = false, + PretranslateChapters = parser.GetChapters(pretranslateBiblicalRangeChapters), + TrainOnChapters = parser.GetChapters(trainOnBiblicalRangeChapters), + PretranslateTextIds = new HashSet(), + TrainOnTextIds = new HashSet(), + SourceFiles = new List + { + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = Path.Combine(Path.GetTempPath(), "Project.zip") + } + }, + TargetFiles = new List + { + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = Path.Combine(Path.GetTempPath(), "Project2.zip") + } + } + }; + }); + return; + } + else + { + corpus1 = new Corpus { - new CorpusFile + Id = "corpusId1", + SourceLanguage = "en", + TargetLanguage = "es", + PretranslateAll = false, + TrainOnAll = false, + PretranslateChapters = parser.GetChapters(pretranslateBiblicalRangeChapters), + TrainOnChapters = parser.GetChapters(trainOnBiblicalRangeChapters), + PretranslateTextIds = new HashSet(), + TrainOnTextIds = new HashSet(), + SourceFiles = new List { - TextId = "textId1", - Format = FileFormat.Paratext, - Location = Path.Combine(Path.GetTempPath(), "Project2.zip") + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = Path.Combine(Path.GetTempPath(), "Project.zip") + } + }, + TargetFiles = new List + { + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = Path.Combine(Path.GetTempPath(), "Project2.zip") + } } - } - }; - var corpora = new ReadOnlyList(new List { corpus1 }); - if (!throwsException){ - await env.BuildJob.RunAsync("engine1", "build1", corpora, "{\"use_key_terms\":false}", default); - } else { - Assert.ThrowsAsync(async () => { - await env.BuildJob.RunAsync("engine1", "build1", corpora, "{\"use_key_terms\":false}", default); - }); - return; + }; } + var corpora = new ReadOnlyList(new List { corpus1 }); + await env.BuildJob.RunAsync("engine1", "build1", corpora, "{\"use_key_terms\":false}", default); using (var stream = await env.SharedFileService.OpenReadAsync("builds/build1/train.src.txt")) { using (var reader = new StreamReader(stream)) @@ -216,7 +252,11 @@ public async Task BuildJobTest_Chapterlevel( { JsonArray? pretranslationJsonObject = JsonSerializer.Deserialize(reader.ReadToEnd()); Assert.NotNull(pretranslationJsonObject); - Assert.That(pretranslationJsonObject!.ToList().Count, Is.EqualTo(numEntriesWrittenToPretranslate), JsonSerializer.Serialize(pretranslationJsonObject)); + Assert.That( + pretranslationJsonObject!.ToList().Count, + Is.EqualTo(numEntriesWrittenToPretranslate), + JsonSerializer.Serialize(pretranslationJsonObject) + ); } } } diff --git a/tests/SIL.Machine.Tests/Scripture/ScriptureRangeParserTests.cs b/tests/SIL.Machine.Tests/Scripture/ScriptureRangeParserTests.cs new file mode 100644 index 000000000..4330f89ef --- /dev/null +++ b/tests/SIL.Machine.Tests/Scripture/ScriptureRangeParserTests.cs @@ -0,0 +1,205 @@ +using NUnit.Framework; +using SIL.Scripture; + +namespace SIL.Machine; + +[TestFixture] +public class ScriptureRangeParserTests +{ + [Test] + [TestCaseSource(nameof(GetCases))] + public void TestParse(string rangeString, Dictionary> expectedOutput, bool throwsException) + { + var parser = new ScriptureRangeParser(); + if (!throwsException) + { + Assert.That(parser.GetChapters(rangeString), Is.EquivalentTo(expectedOutput)); + } + else + { + Assert.Throws(() => + { + parser.GetChapters(rangeString); + }); + } + } + + public static IEnumerable GetCases() + { + yield return new TestCaseData("MAL", new Dictionary> { { "MAL", new List() } }, false); + yield return new TestCaseData( + "GEN,EXO", + new Dictionary> { { "GEN", new List() }, { "EXO", new List() } }, + false + ); + yield return new TestCaseData( + "1JN,2JN", + new Dictionary> { { "1JN", new List() }, { "2JN", new List() } }, + false + ); + yield return new TestCaseData( + "OT", + Enumerable.Range(1, 39).Select(i => (Canon.BookNumberToId(i), new List())).ToDictionary(), + false + ); + yield return new TestCaseData( + "NT", + Enumerable.Range(40, 27).Select(i => (Canon.BookNumberToId(i), new List())).ToDictionary(), + false + ); + yield return new TestCaseData( + "NT,OT", + Enumerable.Range(1, 66).Select(i => (Canon.BookNumberToId(i), new List())).ToDictionary(), + false + ); + yield return new TestCaseData( + "MAT;MRK", + new Dictionary> { { "MAT", new List() }, { "MRK", new List() } }, + false + ); + yield return new TestCaseData( + "MAT; MRK", + new Dictionary> { { "MAT", new List() }, { "MRK", new List() } }, + false + ); + yield return new TestCaseData( + "MAT1,2,3", + new Dictionary> + { + { + "MAT", + new List() { 1, 2, 3 } + } + }, + false + ); + yield return new TestCaseData( + "MAT1, 2, 3", + new Dictionary> + { + { + "MAT", + new List() { 1, 2, 3 } + } + }, + false + ); + yield return new TestCaseData( + "MAT-LUK", + new Dictionary> + { + { "MAT", new List() }, + { "MRK", new List() }, + { "LUK", new List() } + }, + false + ); + yield return new TestCaseData( + "MAT1,2,3;MAT-LUK", + new Dictionary> + { + { "MAT", new List() }, + { "MRK", new List() }, + { "LUK", new List() } + }, + false + ); + yield return new TestCaseData( + "2JN-3JN;EXO1,8,3-5;GEN", + new Dictionary> + { + { "GEN", new List() }, + { + "EXO", + new List() { 1, 3, 4, 5, 8 } + }, + { "2JN", new List() }, + { "3JN", new List() } + }, + false + ); + yield return new TestCaseData( + "1JN 1;1JN 2;1JN 3-5", + new Dictionary> { { "1JN", new List() } }, + false + ); + yield return new TestCaseData( + "MAT-ROM;-ACT4-28", + new Dictionary> + { + { "MAT", new List() }, + { "MRK", new List() }, + { "LUK", new List() }, + { "JHN", new List() }, + { + "ACT", + new List() { 1, 2, 3 } + }, + { "ROM", new List() } + }, + false + ); + yield return new TestCaseData("2JN;-2JN 1", new Dictionary> { }, false); + yield return new TestCaseData( + "NT;OT;-MRK;-EXO", + Enumerable + .Range(1, 66) + .Where(i => i != 2 && i != 41) + .Select(i => (Canon.BookNumberToId(i), new List())) + .ToDictionary(), + false + ); + yield return new TestCaseData( + "NT;-MAT3-5,17;-REV21,22", + Enumerable + .Range(40, 27) + .Select(i => + { + if (i == 40) + { + return ( + Canon.BookNumberToId(i), + Enumerable.Range(1, 28).Where(c => !(c == 3 || c == 4 || c == 5 || c == 17)).ToList() + ); + } + if (i == 66) + { + return (Canon.BookNumberToId(i), Enumerable.Range(1, 20).ToList()); + } + return (Canon.BookNumberToId(i), new List()); + }) + .ToDictionary(), + false + ); + yield return new TestCaseData( + "MAT-JHN;-MAT-LUK", + new Dictionary> { { "JHN", new List() } }, + false + ); + + //*Throw exceptions + yield return new TestCaseData("MAT3-1", new Dictionary>(), true); + yield return new TestCaseData("MRK-MAT", new Dictionary>(), true); + yield return new TestCaseData("MRK;-MRK10-3", new Dictionary>(), true); + yield return new TestCaseData("MAT0-10", new Dictionary>(), true); + yield return new TestCaseData("MAT-FLUM", new Dictionary>(), true); + yield return new TestCaseData("-MAT-FLUM", new Dictionary>(), true); + yield return new TestCaseData("", new Dictionary>(), true); + yield return new TestCaseData("ABC", new Dictionary>(), true); + yield return new TestCaseData("MAT-ABC", new Dictionary>(), true); + yield return new TestCaseData("NT;-ABC-LUK", new Dictionary>(), true); + yield return new TestCaseData("MAT 500", new Dictionary>(), true); + yield return new TestCaseData("MAT 1-500", new Dictionary>(), true); + yield return new TestCaseData("MAT;-MAT 300-500", new Dictionary>(), true); + yield return new TestCaseData("-MRK", new Dictionary>(), true); + yield return new TestCaseData("-MRK 1", new Dictionary>(), true); + yield return new TestCaseData("MRK 2-5;-MRK 1-4", new Dictionary>(), true); + yield return new TestCaseData("MRK 2-5;-MRK 6", new Dictionary>(), true); + yield return new TestCaseData("OT;-MRK-LUK", new Dictionary>(), true); + yield return new TestCaseData("NT;OT;-ABC", new Dictionary>(), true); + yield return new TestCaseData("MAT;-ABC 1", new Dictionary>(), true); + yield return new TestCaseData("NT,OT,-MRK,-EXO", new Dictionary>(), true); + yield return new TestCaseData("OT,MAT1", new Dictionary>(), true); + yield return new TestCaseData("OT,MAT-LUK", new Dictionary>(), true); + } +} From fe05db770089578ec5e17470563d37b247ca9421 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 6 Feb 2024 13:20:15 -0500 Subject: [PATCH 3/4] Changes as per review comments --- src/SIL.Machine.AspNetCore/Models/Corpus.cs | 4 +- .../Services/NmtPreprocessBuildJob.cs | 44 +++++-------------- .../ServalTranslationEngineServiceV1.cs | 4 +- .../Scripture/ScriptureRangeParser.cs | 5 +++ .../Services/NmtPreprocessBuildJobTests.cs | 20 +++++++-- 5 files changed, 35 insertions(+), 42 deletions(-) diff --git a/src/SIL.Machine.AspNetCore/Models/Corpus.cs b/src/SIL.Machine.AspNetCore/Models/Corpus.cs index a40f3ff32..bf741e298 100644 --- a/src/SIL.Machine.AspNetCore/Models/Corpus.cs +++ b/src/SIL.Machine.AspNetCore/Models/Corpus.cs @@ -7,8 +7,8 @@ public class Corpus public string TargetLanguage { get; set; } = default!; public bool TrainOnAll { get; set; } public bool PretranslateAll { get; set; } - public Dictionary>? TrainOnChapters { get; set; } - public Dictionary>? PretranslateChapters { get; set; } + public Dictionary>? TrainOnChapters { get; set; } + public Dictionary>? PretranslateChapters { get; set; } public HashSet TrainOnTextIds { get; set; } = default!; public HashSet PretranslateTextIds { get; set; } = default!; public List SourceFiles { get; set; } = default!; diff --git a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs index 1935bc2b9..aa52d44a1 100644 --- a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs +++ b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs @@ -132,43 +132,19 @@ async IAsyncEnumerable ProcessRowsAsync() { bool isInTrainOnChapters = false; bool isInPretranslateChapters = false; - if (targetCorpora[CorpusType.Text] is ScriptureTextCorpus stc && row.Refs.All(r => r is VerseRef)) + if (targetCorpora[CorpusType.Text] is ScriptureTextCorpus stc) { - Dictionary>? rowChaptersPerBook = null; - if (corpus.TrainOnChapters != null || corpus.PretranslateChapters != null) + bool IsInChapters(Dictionary> bookChapters, object rowRef) { - rowChaptersPerBook = row - .Refs.Cast() - .GroupBy(vr => vr.Book) - .ToDictionary(g => g.Key, g => g.Select(vr => vr.ChapterNum).ToList()); - - if (corpus.TrainOnChapters != null) - { - isInTrainOnChapters = rowChaptersPerBook - .Join( - corpus.TrainOnChapters, - rcpb => rcpb.Key, - tobrc => tobrc.Key, - (rcbp, tobrc) => - rcbp.Value.Intersect(tobrc.Value).Count() > 0 - || (rcbp.Value.Count() > 0 && tobrc.Value.Count() == 0) //Empty list means all chapters from book - ) - .Any(b => b); - } - if (corpus.PretranslateChapters != null) - { - isInPretranslateChapters = rowChaptersPerBook - .Join( - corpus.PretranslateChapters, - rcpb => rcpb.Key, - pbrc => pbrc.Key, - (rcbp, pbrc) => - rcbp.Value.Intersect(pbrc.Value).Count() > 0 - || (rcbp.Value.Count() > 0 && pbrc.Value.Count() == 0) - ) - .Any(b => b); - } + if (rowRef is not VerseRef vr) + return false; + return bookChapters.TryGetValue(vr.Book, out HashSet? chapters) + && (chapters.Contains(vr.ChapterNum) || chapters.Count() == 0); } + if (corpus.TrainOnChapters is not null) + isInTrainOnChapters = row.Refs.Any(r => IsInChapters(corpus.TrainOnChapters, r)); + if (corpus.PretranslateChapters is not null) + isInPretranslateChapters = row.Refs.Any(r => IsInChapters(corpus.PretranslateChapters, r)); } if (corpus.TrainOnAll || corpus.TrainOnTextIds.Contains(row.TextId) || isInTrainOnChapters) { diff --git a/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs b/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs index 0239e4c69..c1c7fbf5d 100644 --- a/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs +++ b/src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs @@ -263,10 +263,10 @@ private static Models.Corpus Map(Serval.Translation.V1.Corpus source) TrainOnAll = source.TrainOnAll, PretranslateAll = source.PretranslateAll, TrainOnChapters = source - .TrainOnChapters.Select(kvp => (kvp.Key, kvp.Value.Chapters.ToList())) + .TrainOnChapters.Select(kvp => (kvp.Key, kvp.Value.Chapters.ToHashSet())) .ToDictionary(), PretranslateChapters = source - .PretranslateChapters.Select(kvp => (kvp.Key, kvp.Value.Chapters.ToList())) + .PretranslateChapters.Select(kvp => (kvp.Key, kvp.Value.Chapters.ToHashSet())) .ToDictionary(), TrainOnTextIds = source.TrainOnTextIds.ToHashSet(), PretranslateTextIds = source.PretranslateTextIds.ToHashSet(), diff --git a/src/SIL.Machine/Scripture/ScriptureRangeParser.cs b/src/SIL.Machine/Scripture/ScriptureRangeParser.cs index 04f7c6282..e3f78ba3b 100644 --- a/src/SIL.Machine/Scripture/ScriptureRangeParser.cs +++ b/src/SIL.Machine/Scripture/ScriptureRangeParser.cs @@ -18,6 +18,11 @@ public class ScriptureRangeParser RegexOptions.Compiled ); + public static Dictionary> GetChapters(string chapterSelections, ScrVers versification = null) + { + return new ScriptureRangeParser(versification).GetChapters(chapterSelections); + } + public ScriptureRangeParser(ScrVers versification = null) { if (versification == null) diff --git a/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs b/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs index a59328883..0868ba263 100644 --- a/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs +++ b/tests/SIL.Machine.AspNetCore.Tests/Services/NmtPreprocessBuildJobTests.cs @@ -176,8 +176,14 @@ public async Task BuildJobTest_Chapterlevel( TargetLanguage = "es", PretranslateAll = false, TrainOnAll = false, - PretranslateChapters = parser.GetChapters(pretranslateBiblicalRangeChapters), - TrainOnChapters = parser.GetChapters(trainOnBiblicalRangeChapters), + PretranslateChapters = parser + .GetChapters(pretranslateBiblicalRangeChapters) + .Select(kvp => (kvp.Key, kvp.Value.ToHashSet())) + .ToDictionary(), + TrainOnChapters = parser + .GetChapters(trainOnBiblicalRangeChapters) + .Select(kvp => (kvp.Key, kvp.Value.ToHashSet())) + .ToDictionary(), PretranslateTextIds = new HashSet(), TrainOnTextIds = new HashSet(), SourceFiles = new List @@ -211,8 +217,14 @@ public async Task BuildJobTest_Chapterlevel( TargetLanguage = "es", PretranslateAll = false, TrainOnAll = false, - PretranslateChapters = parser.GetChapters(pretranslateBiblicalRangeChapters), - TrainOnChapters = parser.GetChapters(trainOnBiblicalRangeChapters), + PretranslateChapters = parser + .GetChapters(pretranslateBiblicalRangeChapters) + .Select(kvp => (kvp.Key, kvp.Value.ToHashSet())) + .ToDictionary(), + TrainOnChapters = parser + .GetChapters(trainOnBiblicalRangeChapters) + .Select(kvp => (kvp.Key, kvp.Value.ToHashSet())) + .ToDictionary(), PretranslateTextIds = new HashSet(), TrainOnTextIds = new HashSet(), SourceFiles = new List From b9a9aab809f460017240cbb406e52e665fd5b8b5 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 6 Feb 2024 13:20:15 -0500 Subject: [PATCH 4/4] Count() to Count --- src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs index aa52d44a1..3b21dda74 100644 --- a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs +++ b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs @@ -139,7 +139,7 @@ bool IsInChapters(Dictionary> bookChapters, object rowRef) if (rowRef is not VerseRef vr) return false; return bookChapters.TryGetValue(vr.Book, out HashSet? chapters) - && (chapters.Contains(vr.ChapterNum) || chapters.Count() == 0); + && (chapters.Contains(vr.ChapterNum) || chapters.Count == 0); } if (corpus.TrainOnChapters is not null) isInTrainOnChapters = row.Refs.Any(r => IsInChapters(corpus.TrainOnChapters, r));