From 4e53fd6bd8ebe3cef86d555ea99cd7d652106582 Mon Sep 17 00:00:00 2001 From: Ezequiel Santos Date: Mon, 12 Aug 2024 09:59:56 +0100 Subject: [PATCH] added NSCache --- .../Impl/GoogleScholarFetcher.swift | 207 ++++++++++-------- .../GoogleScholarSwift/Models/Article.swift | 11 +- .../Models/AuthorMetrics.swift | 2 +- .../Models/GoogleScholarCacheConfig.swift | 37 ++++ .../Models/GoogleScholarID.swift | 2 +- 5 files changed, 166 insertions(+), 93 deletions(-) create mode 100644 Sources/GoogleScholarSwift/Models/GoogleScholarCacheConfig.swift diff --git a/Sources/GoogleScholarSwift/Impl/GoogleScholarFetcher.swift b/Sources/GoogleScholarSwift/Impl/GoogleScholarFetcher.swift index 1a175f3..e207c3e 100644 --- a/Sources/GoogleScholarSwift/Impl/GoogleScholarFetcher.swift +++ b/Sources/GoogleScholarSwift/Impl/GoogleScholarFetcher.swift @@ -4,17 +4,34 @@ import SwiftSoup /// A class responsible for fetching data from Google Scholar. public class GoogleScholarFetcher { private let session: URLSession - private var publicationCache: [GoogleScholarID: [Publication]] = [:] - - // MARK: Public Methods - - /// Initializes a new instance of `GoogleScholarFetcher` with a custom URL session. + private let publicationCache = NSCache() + private let articleCache = NSCache() + + // MARK: - Initializer + + /// Initializes a new instance of `GoogleScholarFetcher` with a custom URL session and cache configuration. /// - /// - Parameter session: A custom URL session. Defaults to `.shared`. - public init(session: URLSession = .shared) { + /// - Parameters: + /// - session: A custom URL session. Defaults to `.shared`. + /// - cacheConfig: The configuration for the cache. Defaults to `.default`. + public init(session: URLSession = .shared, cacheConfig: GoogleScholarCacheConfig = .default) { self.session = session + self.configureCache(with: cacheConfig) } - + + /// Configures the NSCache with the provided configuration. + /// + /// - Parameter cacheConfig: The configuration for the cache. + private func configureCache(with cacheConfig: GoogleScholarCacheConfig) { + publicationCache.countLimit = cacheConfig.publicationCountLimit + publicationCache.totalCostLimit = cacheConfig.publicationTotalCostLimit + + articleCache.countLimit = cacheConfig.articleCountLimit + articleCache.totalCostLimit = cacheConfig.articleTotalCostLimit + } + + // MARK: - Public Methods + /// Fetches all publications for a given author from Google Scholar. /// /// - Parameters: @@ -35,6 +52,12 @@ public class GoogleScholarFetcher { fetchQuantity: FetchQuantity = .all, sortBy: SortBy = .cited ) async throws -> [Publication] { + + let cacheKey = "\(authorID.value)-\(fetchQuantity)-\(sortBy.rawValue)" as NSString + if let cachedPublications = publicationCache.object(forKey: cacheKey) as? [Publication] { + return cachedPublications + } + var allPublications: [Publication] = [] var startIndex = 0 let pageSize = 100 @@ -64,14 +87,7 @@ public class GoogleScholarFetcher { throw NSError(domain: "Invalid URL Components", code: 0, userInfo: nil) } - var request = URLRequest(url: url) - request.addValue(Constants.randomUserAgent(), forHTTPHeaderField: "User-Agent") - request.addValue("https://scholar.google.com/", forHTTPHeaderField: "Referer") - for (header, value) in Constants.headers { - request.addValue(value, forHTTPHeaderField: header) - } - applyCookies(to: &request) - + let request = configureRequest(with: url) let (data, _) = try await session.data(for: request) guard let html = String(data: data, encoding: .utf8) else { @@ -111,9 +127,10 @@ public class GoogleScholarFetcher { } } + publicationCache.setObject(mutablePublications as NSArray, forKey: cacheKey) return mutablePublications } - + /// Fetches the detailed information for a specific article. /// /// - Parameters: @@ -129,18 +146,16 @@ public class GoogleScholarFetcher { /// print(article) /// ``` public func fetchArticle(articleLink: ArticleLink) async throws -> Article { - guard let url = URL(string: articleLink.value) else { - throw NSError(domain: "Invalid URL", code: 0, userInfo: nil) + let cacheKey = articleLink.value as NSString + if let cachedArticle = articleCache.object(forKey: cacheKey) { + return cachedArticle } - var request = URLRequest(url: url) - request.addValue(Constants.randomUserAgent(), forHTTPHeaderField: "User-Agent") - request.addValue("https://scholar.google.com/", forHTTPHeaderField: "Referer") - for (header, value) in Constants.headers { - request.addValue(value, forHTTPHeaderField: header) + guard let url = URL(string: articleLink.value) else { + throw NSError(domain: "Invalid URL", code: 0, userInfo: nil) } - applyCookies(to: &request) + let request = configureRequest(with: url) let (data, _) = try await session.data(for: request) guard let html = String(data: data, encoding: .utf8) else { @@ -149,36 +164,11 @@ public class GoogleScholarFetcher { let doc: Document = try SwiftSoup.parse(html) let article = try self.parseArticle(doc) - return article - } - - /// Parses the publication data from the HTML document. - /// - /// - Parameter doc: The HTML document to parse. - /// - Returns: An array of `Publication` objects. - /// - Throws: An error if parsing fails. - private func parsePublications(_ doc: Document, authorID: GoogleScholarID) throws -> [Publication] { - var publications: [Publication] = [] - let rows = try doc.select(".gsc_a_tr") - for row in rows { - guard let titleElement = try row.select(".gsc_a_at").first(), - let title = try? titleElement.text(), - let link = try? titleElement.attr("href"), - let year = try? row.select(".gsc_a_y span").text(), - let citationsText = try? row.select(".gsc_a_c a").text() else { - continue - } - - let id = extractPublicationID(from: link) - let citations = citationsText.isEmpty ? "0" : citationsText - let publication = Publication(id: id, authorId: authorID, title: title, year: year, link: "https://scholar.google.com" + link, citations: citations) - publications.append(publication) - } - - return publications + articleCache.setObject(article, forKey: cacheKey) + return article } - + /// Fetches the author's details such as name, affiliation, and picture URL from Google Scholar. /// /// - Parameter scholarID: The Google Scholar author ID. @@ -203,14 +193,7 @@ public class GoogleScholarFetcher { throw NSError(domain: "Invalid URL Components", code: 0, userInfo: nil) } - var request = URLRequest(url: url) - request.addValue(Constants.randomUserAgent(), forHTTPHeaderField: "User-Agent") - request.addValue("https://scholar.google.com/", forHTTPHeaderField: "Referer") - for (header, value) in Constants.headers { - request.addValue(value, forHTTPHeaderField: header) - } - applyCookies(to: &request) - + let request = configureRequest(with: url) let (data, _) = try await session.data(for: request) guard let html = String(data: data, encoding: .utf8) else { @@ -219,7 +202,7 @@ public class GoogleScholarFetcher { return try parseAuthorDetails(from: html, id: scholarID) } - + /// Fetches the total number of citations and publications for a given author from Google Scholar. /// /// - Parameters: @@ -248,9 +231,51 @@ public class GoogleScholarFetcher { return AuthorMetrics(citations: totalCitations, publications: totalPublications) } - - // MARK: Private Methods - + + // MARK: - Private Methods + + /// Configures a `URLRequest` with common headers and cookies. + /// + /// - Parameter url: The `URL` for the request. + /// - Returns: A configured `URLRequest`. + private func configureRequest(with url: URL) -> URLRequest { + var request = URLRequest(url: url) + request.addValue(Constants.randomUserAgent(), forHTTPHeaderField: "User-Agent") + request.addValue("https://scholar.google.com/", forHTTPHeaderField: "Referer") + for (header, value) in Constants.headers { + request.addValue(value, forHTTPHeaderField: header) + } + applyCookies(to: &request) + return request + } + + /// Parses the publication data from the HTML document. + /// + /// - Parameter doc: The HTML document to parse. + /// - Returns: An array of `Publication` objects. + /// - Throws: An error if parsing fails. + private func parsePublications(_ doc: Document, authorID: GoogleScholarID) throws -> [Publication] { + var publications: [Publication] = [] + let rows = try doc.select(".gsc_a_tr") + + for row in rows { + guard let titleElement = try row.select(".gsc_a_at").first(), + let title = try? titleElement.text(), + let link = try? titleElement.attr("href"), + let year = try? row.select(".gsc_a_y span").text(), + let citationsText = try? row.select(".gsc_a_c a").text() else { + continue + } + + let id = extractPublicationID(from: link) + let citations = citationsText.isEmpty ? "0" : citationsText + let publication = Publication(id: id, authorId: authorID, title: title, year: year, link: "https://scholar.google.com" + link, citations: citations) + publications.append(publication) + } + + return publications + } + /// Parses the article details from the HTML document. /// /// - Parameter doc: The HTML document to parse. @@ -267,7 +292,24 @@ public class GoogleScholarFetcher { return Article(title: title, authors: authors, publicationDate: publicationDate, publication: publication, description: description, totalCitations: totalCitations) } - + + /// Parses the author's details from the HTML string. + /// + /// - Parameters: + /// - html: The HTML string to parse. + /// - id: The Google Scholar author ID. + /// - Returns: A `Author` object containing the author's details. + /// - Throws: An error if parsing fails. + private func parseAuthorDetails(from html: String, id: GoogleScholarID) throws -> Author { + let doc: Document = try SwiftSoup.parse(html) + + let name = try doc.select("#gsc_prf_in").text() + let affiliation = try doc.select(".gsc_prf_il").first()?.text() ?? "" + let pictureURL = try doc.select("#gsc_prf_pua img").attr("src") + + return Author(id: id, name: name, affiliation: affiliation, pictureURL: pictureURL) + } + /// Selects the value from the specified index in the document. /// /// - Parameters: @@ -286,7 +328,7 @@ public class GoogleScholarFetcher { } return defaultValue } - + /// Selects the total number of citations from the document. /// /// - Parameter doc: The HTML document. @@ -301,7 +343,7 @@ public class GoogleScholarFetcher { } return "" } - + /// Extracts a number from a string. /// /// - Parameter text: The string containing the number. @@ -313,7 +355,7 @@ public class GoogleScholarFetcher { } return nil } - + /// Extracts the publication ID from the link. /// /// - Parameter link: The link containing the publication ID. @@ -326,25 +368,10 @@ public class GoogleScholarFetcher { } return "" } - - /// Parses the author's details from the HTML string. + + /// Helper function to update cookies dynamically. /// - /// - Parameters: - /// - html: The HTML string to parse. - /// - id: The Google Scholar author ID. - /// - Returns: A `Author` object containing the author's details. - /// - Throws: An error if parsing fails. - private func parseAuthorDetails(from html: String, id: GoogleScholarID) throws -> Author { - let doc: Document = try SwiftSoup.parse(html) - - let name = try doc.select("#gsc_prf_in").text() - let affiliation = try doc.select(".gsc_prf_il").first()?.text() ?? "" - let pictureURL = try doc.select("#gsc_prf_pua img").attr("src") - - return Author(id: id, name: name, affiliation: affiliation, pictureURL: pictureURL) - } - - // Helper function to update cookies dynamically + /// - Parameter response: The `HTTPURLResponse` from which to extract cookies. private func updateCookies(from response: HTTPURLResponse) { if let headerFields = response.allHeaderFields as? [String: String], let url = response.url { @@ -354,8 +381,10 @@ public class GoogleScholarFetcher { } } } - - // Helper function to apply cookies to the request + + /// Helper function to apply cookies to the request. + /// + /// - Parameter request: The `URLRequest` to which cookies will be added. private func applyCookies(to request: inout URLRequest) { let cookieString = Constants.cookies.map { "\($0.key)=\($0.value)" }.joined(separator: "; ") request.addValue(cookieString, forHTTPHeaderField: "Cookie") diff --git a/Sources/GoogleScholarSwift/Models/Article.swift b/Sources/GoogleScholarSwift/Models/Article.swift index a707436..84e4866 100644 --- a/Sources/GoogleScholarSwift/Models/Article.swift +++ b/Sources/GoogleScholarSwift/Models/Article.swift @@ -1,7 +1,8 @@ import Foundation -/// Model for article details. -public struct Article: Codable, Hashable, Identifiable, Equatable, CustomStringConvertible { +/// Class for article details. +public class Article: Codable, Hashable, Identifiable, Equatable, CustomStringConvertible { + /// The unique identifier for the article. public let id: String /// The title of the article. @@ -40,4 +41,10 @@ public struct Article: Codable, Hashable, Identifiable, Equatable, CustomStringC public var localizedDescription: String { return "Article(id: \(id), title: \(title), authors: \(authors), publicationDate: \(publicationDate), publication: \(publication), description: \(description), totalCitations: \(totalCitations))" } + + public static func == (lhs: Article, rhs: Article) -> Bool { + return lhs.description == rhs.description + } + + public func hash(into hasher: inout Hasher) {} } diff --git a/Sources/GoogleScholarSwift/Models/AuthorMetrics.swift b/Sources/GoogleScholarSwift/Models/AuthorMetrics.swift index 3700e07..b8c2136 100644 --- a/Sources/GoogleScholarSwift/Models/AuthorMetrics.swift +++ b/Sources/GoogleScholarSwift/Models/AuthorMetrics.swift @@ -1,6 +1,6 @@ import Foundation -/// A struct representing the total citations and publications for a given author. +/// Model for the total citations and publications for a given author. public struct AuthorMetrics: Codable, Hashable, Equatable, CustomStringConvertible { /// The total number of citations across all fetched publications. diff --git a/Sources/GoogleScholarSwift/Models/GoogleScholarCacheConfig.swift b/Sources/GoogleScholarSwift/Models/GoogleScholarCacheConfig.swift new file mode 100644 index 0000000..0a17289 --- /dev/null +++ b/Sources/GoogleScholarSwift/Models/GoogleScholarCacheConfig.swift @@ -0,0 +1,37 @@ +import Foundation +import SwiftSoup + +/// A configuration object for the cache settings in `GoogleScholarFetcher`. +public struct GoogleScholarCacheConfig { + public let publicationCountLimit: Int + public let publicationTotalCostLimit: Int + public let articleCountLimit: Int + public let articleTotalCostLimit: Int + + /// Default cache configuration. + public static let `default` = GoogleScholarCacheConfig( + publicationCountLimit: 100, + publicationTotalCostLimit: 1024 * 1024 * 10, + articleCountLimit: 100, + articleTotalCostLimit: 1024 * 1024 * 5 + ) + + /// Initializes a new cache configuration. + /// + /// - Parameters: + /// - publicationCountLimit: Maximum number of publications to cache. + /// - publicationTotalCostLimit: Maximum total cost of publications in the cache. + /// - articleCountLimit: Maximum number of articles to cache. + /// - articleTotalCostLimit: Maximum total cost of articles in the cache. + public init( + publicationCountLimit: Int, + publicationTotalCostLimit: Int, + articleCountLimit: Int, + articleTotalCostLimit: Int + ) { + self.publicationCountLimit = publicationCountLimit + self.publicationTotalCostLimit = publicationTotalCostLimit + self.articleCountLimit = articleCountLimit + self.articleTotalCostLimit = articleTotalCostLimit + } +} diff --git a/Sources/GoogleScholarSwift/Models/GoogleScholarID.swift b/Sources/GoogleScholarSwift/Models/GoogleScholarID.swift index 32c60d6..ae8c062 100644 --- a/Sources/GoogleScholarSwift/Models/GoogleScholarID.swift +++ b/Sources/GoogleScholarSwift/Models/GoogleScholarID.swift @@ -1,6 +1,6 @@ import Foundation -/// Type representing a Google Scholar ID. +/// Model type representing a Google Scholar ID. public struct GoogleScholarID: Codable, Hashable, Equatable, CustomStringConvertible { public let value: String