diff --git a/server/scripts/populate-courses.ts b/server/scripts/populate-courses.ts index ce84073d..376c32c5 100644 --- a/server/scripts/populate-courses.ts +++ b/server/scripts/populate-courses.ts @@ -610,6 +610,7 @@ export const addCourseDescription = async (course): Promise => { */ export const addAllSimilarityData = async (): Promise => { try { + console.log("adding similarity data"); const courses = await Classes.find().exec(); if (courses) { for (const course of courses) { @@ -637,7 +638,21 @@ const addSimilarityData = async (courses, course): Promise => { const similarities = []; const tfidf = await RecommendationMetadata.findOne({ _id: courseId }).exec(); for (const c of courses) { - if (c._id !== courseId && !c.crossList.includes(courseId) && c.classRating !== null && c.classRating !== 0) { + let crossList = false; + for (const crosslist of c.crossList) { + if (similarities.some(sim => sim._id === crosslist)) { + crossList = true; + break; + } + } + if ( + c._id !== courseId && + !c.crossList.includes(courseId) && + c.classRating && + c.classRating !== null && + c.classRating !== 0 && + !crossList + ) { const compTfidf = await RecommendationMetadata.findOne({ _id: c._id }).exec(); const cos = cosineSimilarity(tfidf.tfidfVector, compTfidf.tfidfVector); if (cos < 1) { diff --git a/server/scripts/populate-recdata.ts b/server/scripts/populate-recdata.ts index 73b2d420..a43403b2 100644 --- a/server/scripts/populate-recdata.ts +++ b/server/scripts/populate-recdata.ts @@ -9,6 +9,7 @@ import { preprocess, idf, tfidf } from '../src/course/course.recalgo'; */ export const addAllProcessedDescriptions = async (): Promise => { try { + console.log("adding processed descriptions"); const courses = await Classes.find().exec(); if (courses) { for (const course of courses) { @@ -35,7 +36,6 @@ const addProcessedDescription = async (course): Promise => { const subject = course.classSub; const num = course.classNum; try { - console.log(`${subject} ${num}: ${processed}`) const rec = await RecommendationMetadata.findOne({ _id: courseId }); if (rec) { await RecommendationMetadata.updateOne( @@ -58,6 +58,7 @@ const addProcessedDescription = async (course): Promise => { throw new Error(); } } + console.log(`${subject} ${num}`); return true; } catch (err) { console.log(`Error in adding processed description for ${subject} ${num}: ${err}`); @@ -71,10 +72,12 @@ const addProcessedDescription = async (course): Promise => { */ export const addIdfVector = async (): Promise => { try { + console.log("adding idf vector"); const metadata = await RecommendationMetadata.find().exec(); const descriptions = metadata.map(course => course.processedDescription.split(' ')); const allTerms = [...new Set(descriptions.flat())]; const idfValues = idf(allTerms, descriptions); + await GlobalMetadata.deleteMany({}); const res = await new GlobalMetadata({ idfVector: idfValues }).save(); @@ -97,6 +100,7 @@ export const addIdfVector = async (): Promise => { */ export const addAllTfIdfVectors = async (): Promise => { try { + console.log("adding tfidf vectors"); const courses = await RecommendationMetadata.find().exec(); const global = await GlobalMetadata.findOne().exec(); const idfVector = global.idfVector; diff --git a/server/src/course/course.recalgo.ts b/server/src/course/course.recalgo.ts index d0b56064..36dca585 100644 --- a/server/src/course/course.recalgo.ts +++ b/server/src/course/course.recalgo.ts @@ -14,30 +14,32 @@ const stemWord = (word) => { return word; } +const cleanWords = (sentence: string, fillerWords: string[]) => + sentence + .match(/\b\w+\b/g) + ?.map(word => { + let singularWord = stemWord(word.toLowerCase()); + return fillerWords.includes(singularWord) ? '' : singularWord; + }) + .filter(Boolean) + .join(' '); + /** * Preprocesses the description to remove pluralities and unnecessary punctuation * @param description A course description that needs to be preprocessed * @returns The processed description for a course */ export const preprocess = (description: string) => { - const sentences = description.match(/[^.!?]*[.!?]\s+[A-Z]/g) || [description]; - const fillerWords = ["and", "the", "to", "for", "with"]; + const fillerWords = ["and", "the", "to", "for", "with", "it", "you", "not", "but", "have", "been", "of", "all", "in", "your", "their", "do", "this", "a", "is", "be", "will"]; + const sentences = description.match(/[^.!?]+[.!?]*/g) || [description]; const processedText = sentences.map(sentence => { - const words = sentence.match(/\b\w+\b/g) || []; - const cleanedWords = words.map(word => { - let singularWord = stemWord(word.toLowerCase()); - fillerWords.forEach(filler => { - const regex = new RegExp(`\\b${filler}\\b`, 'g'); - singularWord = singularWord.replace(regex, ''); - }); - return singularWord.replace(/[^\w\s]/g, ''); - }); - return cleanedWords.join(' '); + const cleaned = cleanWords(sentence, fillerWords); + return cleaned; }); - return processedText.join('. '); -} + return processedText.join('. ').trim(); +}; /** * Calculates the inverse document frequency for the given terms