Skip to content

Commit

Permalink
added NSCache
Browse files Browse the repository at this point in the history
  • Loading branch information
ezefranca committed Aug 12, 2024
1 parent 35e450a commit 4e53fd6
Show file tree
Hide file tree
Showing 5 changed files with 166 additions and 93 deletions.
207 changes: 118 additions & 89 deletions Sources/GoogleScholarSwift/Impl/GoogleScholarFetcher.swift
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,34 @@ import SwiftSoup
/// A class responsible for fetching data from Google Scholar.
public class GoogleScholarFetcher {
private let session: URLSession
private var publicationCache: [GoogleScholarID: [Publication]] = [:]

// MARK: Public Methods

/// Initializes a new instance of `GoogleScholarFetcher` with a custom URL session.
private let publicationCache = NSCache<NSString, NSArray>()
private let articleCache = NSCache<NSString, Article>()

// MARK: - Initializer

/// Initializes a new instance of `GoogleScholarFetcher` with a custom URL session and cache configuration.
///
/// - Parameter session: A custom URL session. Defaults to `.shared`.
public init(session: URLSession = .shared) {
/// - Parameters:
/// - session: A custom URL session. Defaults to `.shared`.
/// - cacheConfig: The configuration for the cache. Defaults to `.default`.
public init(
    session: URLSession = .shared,
    cacheConfig: GoogleScholarCacheConfig = .default
) {
    // Keep the session used for network calls, then apply the limits to both caches.
    self.session = session
    configureCache(with: cacheConfig)
}


/// Copies the limits from `cacheConfig` onto the publication and article caches.
///
/// - Parameter cacheConfig: Supplies the count limit and total-cost limit for each cache.
private func configureCache(with cacheConfig: GoogleScholarCacheConfig) {
    // Limits for cached articles.
    articleCache.totalCostLimit = cacheConfig.articleTotalCostLimit
    articleCache.countLimit = cacheConfig.articleCountLimit

    // Limits for cached publication lists.
    publicationCache.totalCostLimit = cacheConfig.publicationTotalCostLimit
    publicationCache.countLimit = cacheConfig.publicationCountLimit
}

// MARK: - Public Methods

/// Fetches all publications for a given author from Google Scholar.
///
/// - Parameters:
Expand All @@ -35,6 +52,12 @@ public class GoogleScholarFetcher {
fetchQuantity: FetchQuantity = .all,
sortBy: SortBy = .cited
) async throws -> [Publication] {

let cacheKey = "\(authorID.value)-\(fetchQuantity)-\(sortBy.rawValue)" as NSString
if let cachedPublications = publicationCache.object(forKey: cacheKey) as? [Publication] {
return cachedPublications
}

var allPublications: [Publication] = []
var startIndex = 0
let pageSize = 100
Expand Down Expand Up @@ -64,14 +87,7 @@ public class GoogleScholarFetcher {
throw NSError(domain: "Invalid URL Components", code: 0, userInfo: nil)
}

var request = URLRequest(url: url)
request.addValue(Constants.randomUserAgent(), forHTTPHeaderField: "User-Agent")
request.addValue("https://scholar.google.com/", forHTTPHeaderField: "Referer")
for (header, value) in Constants.headers {
request.addValue(value, forHTTPHeaderField: header)
}
applyCookies(to: &request)

let request = configureRequest(with: url)
let (data, _) = try await session.data(for: request)

guard let html = String(data: data, encoding: .utf8) else {
Expand Down Expand Up @@ -111,9 +127,10 @@ public class GoogleScholarFetcher {
}
}

publicationCache.setObject(mutablePublications as NSArray, forKey: cacheKey)
return mutablePublications
}

/// Fetches the detailed information for a specific article.
///
/// - Parameters:
Expand All @@ -129,18 +146,16 @@ public class GoogleScholarFetcher {
/// print(article)
/// ```
public func fetchArticle(articleLink: ArticleLink) async throws -> Article {
guard let url = URL(string: articleLink.value) else {
throw NSError(domain: "Invalid URL", code: 0, userInfo: nil)
let cacheKey = articleLink.value as NSString
if let cachedArticle = articleCache.object(forKey: cacheKey) {
return cachedArticle
}

var request = URLRequest(url: url)
request.addValue(Constants.randomUserAgent(), forHTTPHeaderField: "User-Agent")
request.addValue("https://scholar.google.com/", forHTTPHeaderField: "Referer")
for (header, value) in Constants.headers {
request.addValue(value, forHTTPHeaderField: header)
guard let url = URL(string: articleLink.value) else {
throw NSError(domain: "Invalid URL", code: 0, userInfo: nil)
}
applyCookies(to: &request)

let request = configureRequest(with: url)
let (data, _) = try await session.data(for: request)

guard let html = String(data: data, encoding: .utf8) else {
Expand All @@ -149,36 +164,11 @@ public class GoogleScholarFetcher {

let doc: Document = try SwiftSoup.parse(html)
let article = try self.parseArticle(doc)
return article
}

/// Parses the publication data from the HTML document.
///
/// - Parameter doc: The HTML document to parse.
/// - Returns: An array of `Publication` objects.
/// - Throws: An error if parsing fails.
private func parsePublications(_ doc: Document, authorID: GoogleScholarID) throws -> [Publication] {
var publications: [Publication] = []
let rows = try doc.select(".gsc_a_tr")

for row in rows {
guard let titleElement = try row.select(".gsc_a_at").first(),
let title = try? titleElement.text(),
let link = try? titleElement.attr("href"),
let year = try? row.select(".gsc_a_y span").text(),
let citationsText = try? row.select(".gsc_a_c a").text() else {
continue
}

let id = extractPublicationID(from: link)
let citations = citationsText.isEmpty ? "0" : citationsText
let publication = Publication(id: id, authorId: authorID, title: title, year: year, link: "https://scholar.google.com" + link, citations: citations)
publications.append(publication)
}

return publications
articleCache.setObject(article, forKey: cacheKey)
return article
}

/// Fetches the author's details such as name, affiliation, and picture URL from Google Scholar.
///
/// - Parameter scholarID: The Google Scholar author ID.
Expand All @@ -203,14 +193,7 @@ public class GoogleScholarFetcher {
throw NSError(domain: "Invalid URL Components", code: 0, userInfo: nil)
}

var request = URLRequest(url: url)
request.addValue(Constants.randomUserAgent(), forHTTPHeaderField: "User-Agent")
request.addValue("https://scholar.google.com/", forHTTPHeaderField: "Referer")
for (header, value) in Constants.headers {
request.addValue(value, forHTTPHeaderField: header)
}
applyCookies(to: &request)

let request = configureRequest(with: url)
let (data, _) = try await session.data(for: request)

guard let html = String(data: data, encoding: .utf8) else {
Expand All @@ -219,7 +202,7 @@ public class GoogleScholarFetcher {

return try parseAuthorDetails(from: html, id: scholarID)
}

/// Fetches the total number of citations and publications for a given author from Google Scholar.
///
/// - Parameters:
Expand Down Expand Up @@ -248,9 +231,51 @@ public class GoogleScholarFetcher {

return AuthorMetrics(citations: totalCitations, publications: totalPublications)
}

// MARK: Private Methods


// MARK: - Private Methods

/// Builds the `URLRequest` shared by the fetch methods.
///
/// The request receives a user agent from `Constants.randomUserAgent()`, a Google Scholar
/// referer, every entry of `Constants.headers`, and whatever `applyCookies(to:)` adds.
///
/// - Parameter url: The `URL` the request targets.
/// - Returns: The configured `URLRequest`.
private func configureRequest(with url: URL) -> URLRequest {
    var configured = URLRequest(url: url)

    // Fixed headers first, then the shared header table.
    configured.addValue(Constants.randomUserAgent(), forHTTPHeaderField: "User-Agent")
    configured.addValue("https://scholar.google.com/", forHTTPHeaderField: "Referer")
    for (fieldName, fieldValue) in Constants.headers {
        configured.addValue(fieldValue, forHTTPHeaderField: fieldName)
    }

    applyCookies(to: &configured)
    return configured
}

/// Extracts one `Publication` per usable `.gsc_a_tr` row of the document.
///
/// A row is skipped when it has no `.gsc_a_at` title element, or when its title, link,
/// year, or citation text cannot be read.
///
/// - Parameters:
///   - doc: The HTML document to parse.
///   - authorID: The author ID stored on every returned `Publication`.
/// - Returns: The publications built from the usable rows, in iteration order.
/// - Throws: An error if selecting the rows or a row's title element fails.
private func parsePublications(_ doc: Document, authorID: GoogleScholarID) throws -> [Publication] {
    return try doc.select(".gsc_a_tr").compactMap { row -> Publication? in
        // A failing title selector propagates; a missing title element only skips the row.
        guard let anchor = try row.select(".gsc_a_at").first() else {
            return nil
        }
        guard let title = try? anchor.text(),
              let href = try? anchor.attr("href"),
              let year = try? row.select(".gsc_a_y span").text(),
              let citedBy = try? row.select(".gsc_a_c a").text() else {
            return nil
        }

        return Publication(
            id: extractPublicationID(from: href),
            authorId: authorID,
            title: title,
            year: year,
            link: "https://scholar.google.com" + href,
            // An empty citation cell is reported as "0".
            citations: citedBy.isEmpty ? "0" : citedBy
        )
    }
}

/// Parses the article details from the HTML document.
///
/// - Parameter doc: The HTML document to parse.
Expand All @@ -267,7 +292,24 @@ public class GoogleScholarFetcher {

return Article(title: title, authors: authors, publicationDate: publicationDate, publication: publication, description: description, totalCitations: totalCitations)
}


/// Builds an `Author` from the HTML of a profile page.
///
/// - Parameters:
///   - html: The HTML string to parse.
///   - id: The Google Scholar author ID stored on the result.
/// - Returns: An `Author` carrying the name, affiliation, and picture URL read from the page.
/// - Throws: An error if the HTML cannot be parsed or a selector lookup fails.
private func parseAuthorDetails(from html: String, id: GoogleScholarID) throws -> Author {
    let page: Document = try SwiftSoup.parse(html)

    // Arguments are evaluated left to right: name, affiliation, then picture URL.
    // The affiliation falls back to "" when no `.gsc_prf_il` element is present.
    return try Author(
        id: id,
        name: page.select("#gsc_prf_in").text(),
        affiliation: page.select(".gsc_prf_il").first()?.text() ?? "",
        pictureURL: page.select("#gsc_prf_pua img").attr("src")
    )
}

/// Selects the value from the specified index in the document.
///
/// - Parameters:
Expand All @@ -286,7 +328,7 @@ public class GoogleScholarFetcher {
}
return defaultValue
}

/// Selects the total number of citations from the document.
///
/// - Parameter doc: The HTML document.
Expand All @@ -301,7 +343,7 @@ public class GoogleScholarFetcher {
}
return ""
}

/// Extracts a number from a string.
///
/// - Parameter text: The string containing the number.
Expand All @@ -313,7 +355,7 @@ public class GoogleScholarFetcher {
}
return nil
}

/// Extracts the publication ID from the link.
///
/// - Parameter link: The link containing the publication ID.
Expand All @@ -326,25 +368,10 @@ public class GoogleScholarFetcher {
}
return ""
}
/// Parses the author's details from the HTML string.

/// Helper function to update cookies dynamically.
///
/// - Parameters:
/// - html: The HTML string to parse.
/// - id: The Google Scholar author ID.
/// - Returns: An `Author` object containing the author's details.
/// - Throws: An error if parsing fails.
private func parseAuthorDetails(from html: String, id: GoogleScholarID) throws -> Author {
let doc: Document = try SwiftSoup.parse(html)

let name = try doc.select("#gsc_prf_in").text()
let affiliation = try doc.select(".gsc_prf_il").first()?.text() ?? ""
let pictureURL = try doc.select("#gsc_prf_pua img").attr("src")

return Author(id: id, name: name, affiliation: affiliation, pictureURL: pictureURL)
}

// Helper function to update cookies dynamically
/// - Parameter response: The `HTTPURLResponse` from which to extract cookies.
private func updateCookies(from response: HTTPURLResponse) {
if let headerFields = response.allHeaderFields as? [String: String],
let url = response.url {
Expand All @@ -354,8 +381,10 @@ public class GoogleScholarFetcher {
}
}
}

// Helper function to apply cookies to the request

/// Helper function to apply cookies to the request.
///
/// - Parameter request: The `URLRequest` to which cookies will be added.
private func applyCookies(to request: inout URLRequest) {
let cookieString = Constants.cookies.map { "\($0.key)=\($0.value)" }.joined(separator: "; ")
request.addValue(cookieString, forHTTPHeaderField: "Cookie")
Expand Down
11 changes: 9 additions & 2 deletions Sources/GoogleScholarSwift/Models/Article.swift
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import Foundation

/// Model for article details.
public struct Article: Codable, Hashable, Identifiable, Equatable, CustomStringConvertible {
/// Class for article details.
public class Article: Codable, Hashable, Identifiable, Equatable, CustomStringConvertible {

/// The unique identifier for the article.
public let id: String
/// The title of the article.
Expand Down Expand Up @@ -40,4 +41,10 @@ public struct Article: Codable, Hashable, Identifiable, Equatable, CustomStringC
public var localizedDescription: String {
    // Single-line summary listing every property of the article.
    "Article(id: \(id), title: \(title), authors: \(authors), publicationDate: \(publicationDate), publication: \(publication), description: \(description), totalCitations: \(totalCitations))"
}

/// Two articles are equal when their `description` values match; no other property is compared.
public static func == (lhs: Article, rhs: Article) -> Bool {
    lhs.description == rhs.description
}

/// Hashes the article by its `description`, the same property `==` compares.
///
/// Equal articles therefore still hash equally, while articles with different descriptions
/// no longer all share one hash value (which an empty implementation would give them).
///
/// - Parameter hasher: The hasher to feed the article's essential component into.
public func hash(into hasher: inout Hasher) {
    hasher.combine(description)
}
}
2 changes: 1 addition & 1 deletion Sources/GoogleScholarSwift/Models/AuthorMetrics.swift
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import Foundation

/// A struct representing the total citations and publications for a given author.
/// Model for the total citations and publications for a given author.
public struct AuthorMetrics: Codable, Hashable, Equatable, CustomStringConvertible {

/// The total number of citations across all fetched publications.
Expand Down
37 changes: 37 additions & 0 deletions Sources/GoogleScholarSwift/Models/GoogleScholarCacheConfig.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import Foundation
import SwiftSoup

/// A configuration object for the cache settings in `GoogleScholarFetcher`.
///
/// Every initializer argument has a default, so a caller can override a single limit:
/// ```swift
/// let config = GoogleScholarCacheConfig(articleCountLimit: 20)
/// ```
public struct GoogleScholarCacheConfig {
    /// Count limit for the publication cache.
    ///
    /// NOTE(review): the fetcher appears to store one entry per fetched result list, so this
    /// bounds the number of cached lists rather than individual publications — confirm against the fetcher.
    public let publicationCountLimit: Int
    /// Total-cost limit for the publication cache.
    ///
    /// NOTE(review): a cost limit presumably only has an effect when entries are stored with a cost — verify at the call sites.
    public let publicationTotalCostLimit: Int
    /// Count limit for the article cache.
    public let articleCountLimit: Int
    /// Total-cost limit for the article cache (same caveat as `publicationTotalCostLimit`).
    public let articleTotalCostLimit: Int

    /// Default cache configuration: 100 entries per cache, with total-cost limits of
    /// 10 * 1024 * 1024 for publications and 5 * 1024 * 1024 for articles.
    public static let `default` = GoogleScholarCacheConfig()

    /// Initializes a new cache configuration.
    ///
    /// - Parameters:
    ///   - publicationCountLimit: Count limit for the publication cache. Defaults to `100`.
    ///   - publicationTotalCostLimit: Total-cost limit for the publication cache. Defaults to `1024 * 1024 * 10`.
    ///   - articleCountLimit: Count limit for the article cache. Defaults to `100`.
    ///   - articleTotalCostLimit: Total-cost limit for the article cache. Defaults to `1024 * 1024 * 5`.
    public init(
        publicationCountLimit: Int = 100,
        publicationTotalCostLimit: Int = 1024 * 1024 * 10,
        articleCountLimit: Int = 100,
        articleTotalCostLimit: Int = 1024 * 1024 * 5
    ) {
        self.publicationCountLimit = publicationCountLimit
        self.publicationTotalCostLimit = publicationTotalCostLimit
        self.articleCountLimit = articleCountLimit
        self.articleTotalCostLimit = articleTotalCostLimit
    }
}
2 changes: 1 addition & 1 deletion Sources/GoogleScholarSwift/Models/GoogleScholarID.swift
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import Foundation

/// Type representing a Google Scholar ID.
/// Model type representing a Google Scholar ID.
public struct GoogleScholarID: Codable, Hashable, Equatable, CustomStringConvertible {
public let value: String

Expand Down

0 comments on commit 4e53fd6

Please sign in to comment.