Skip to content

Commit fcd51ff

Browse files
authored
Merge pull request #22 from compscidr/jason/disk-caching
Working disk caching
2 parents a69ccad + 9c7be89 commit fcd51ff

File tree

6 files changed

+153
-35
lines changed

6 files changed

+153
-35
lines changed

Diff for: .gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,5 @@ go-scholar
1717

1818
.idea
1919
scholar-example/scholar-example
20+
articles.json
21+
profiles.json

Diff for: README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ This tool is inspired by [scholar.py](https://github.com/ckreibich/scholar.py)
88
```
99
import "github.com/compscidr/scholar"
1010
11-
sch := scholar.New()
11+
sch := scholar.New("profiles.json", "articles.json")
1212
articles := sch.QueryProfile("SbUmSEAAAAAJ", 1)
1313
1414
for _, article := range articles {
@@ -23,11 +23,11 @@ Working:
2323
* Caches the profile for a day, and articles for a week (need to confirm this is working)
2424
* This is in memory, so if the program is restarted, the cache is lost
2525
* Configurable limit to number of articles to query in one go
26+
* On-disk caching of the profile and articles to avoid hitting the rate limit
2627

2728
## TODO:
2829
* Pagination of articles
2930
* Add throttling to avoid hitting the rate limit (figure out what the limit is)
30-
* Add on-disk caching so that if program restarts the cache is not lost
3131

3232
## Possible throttle info:
3333
https://stackoverflow.com/questions/60271587/how-long-is-the-error-429-toomanyrequests-cooldown

Diff for: scholar-example/.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
articles.json
2+
profile.json
3+

Diff for: scholar-example/main.go

+15-3
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,10 @@ func main() {
2323
user := *userPtr
2424
limit := *limitPtr
2525

26-
sch := scholar.New()
26+
sch := scholar.New("profile.json", "articles.json")
2727
//articles := sch.QueryProfileDumpResponse(user, limit, true)
2828
//articles := sch.QueryProfile(user, limit)
29-
articles := sch.QueryProfileWithCache(user, limit)
29+
articles := sch.QueryProfileWithMemoryCache(user, limit)
3030

3131
if len(articles) == 0 {
3232
fmt.Println("Not found")
@@ -37,7 +37,7 @@ func main() {
3737
fmt.Println(article)
3838
}
3939

40-
cachedArticles := sch.QueryProfileWithCache(user, limit)
40+
cachedArticles := sch.QueryProfileWithMemoryCache(user, limit)
4141
if len(articles) == 0 {
4242
fmt.Println("Not found")
4343
return
@@ -46,4 +46,16 @@ func main() {
4646
for _, article := range cachedArticles {
4747
fmt.Println(article)
4848
}
49+
50+
sch.SaveCache("profile.json", "articles.json")
51+
sch2 := scholar.New("profile.json", "articles.json")
52+
cachedArticles2 := sch2.QueryProfileWithMemoryCache(user, limit)
53+
if len(articles) == 0 {
54+
fmt.Println("Not found")
55+
return
56+
}
57+
58+
for _, article := range cachedArticles2 {
59+
fmt.Println(article)
60+
}
4961
}

Diff for: scholar.go

+130-29
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,16 @@ package go_scholar
22

33
import (
44
"bytes"
5+
"encoding/json"
56
"fmt"
67
"github.com/PuerkitoBio/goquery"
7-
cmap "github.com/orcaman/concurrent-map/v2"
88
"io"
99
"log"
1010
"net/http"
11+
"os"
1112
"strconv"
1213
"strings"
14+
"sync"
1315
"time"
1416
)
1517

@@ -46,49 +48,145 @@ type Profile struct {
4648
}
4749

4850
type Scholar struct {
49-
articles cmap.ConcurrentMap[string, Article] // map of articles by URL
50-
profile cmap.ConcurrentMap[string, Profile] // map of profile by User string
51+
articles sync.Map // map of articles by URL
52+
profile sync.Map // map of profile by User string
5153
}
5254

53-
func New() Scholar {
54-
return Scholar{
55-
articles: cmap.New[Article](),
56-
profile: cmap.New[Profile](),
55+
func New(profileCache string, articleCache string) *Scholar {
56+
57+
profileFile, err := os.Open(profileCache)
58+
if err != nil {
59+
println("Error opening profile cache file: " + profileCache + " - creating new cache")
60+
return &Scholar{}
61+
}
62+
defer func(file *os.File) {
63+
err := file.Close()
64+
if err != nil {
65+
println("Error closing profile cache file: " + profileCache)
66+
}
67+
}(profileFile)
68+
profileDecoder := json.NewDecoder(profileFile)
69+
var regularProfileMap map[string]Profile
70+
err = profileDecoder.Decode(&regularProfileMap)
71+
if err != nil {
72+
println("Error decoding profile file: " + profileCache + " - creating new cache")
73+
return &Scholar{}
74+
}
75+
76+
articleFile, err := os.Open(articleCache)
77+
if err != nil {
78+
println("Error opening article cache file: " + articleCache + " - creating new cache")
79+
return &Scholar{}
80+
}
81+
defer func(file *os.File) {
82+
err := file.Close()
83+
if err != nil {
84+
println("Error closing article cache file: " + articleCache)
85+
}
86+
}(articleFile)
87+
articleDecoder := json.NewDecoder(articleFile)
88+
var regularArticleMap map[string]Article
89+
err = articleDecoder.Decode(&regularArticleMap)
90+
if err != nil {
91+
println("Error decoding article cache file: " + articleCache + " - creating new cache")
92+
return &Scholar{}
93+
}
94+
95+
sch := Scholar{}
96+
97+
// convert the regular maps to sync maps
98+
for key, value := range regularProfileMap {
99+
sch.profile.Store(key, value)
100+
}
101+
for key, value := range regularArticleMap {
102+
sch.articles.Store(key, value)
103+
}
104+
105+
return &sch
106+
}
107+
108+
func (sch *Scholar) SaveCache(profileCache string, articleCache string) {
109+
profileFile, err := os.Create(profileCache)
110+
if err != nil {
111+
println("Error opening profile cache file: " + profileCache)
112+
return
113+
}
114+
defer func(file *os.File) {
115+
err := file.Close()
116+
if err != nil {
117+
println("Error closing profile cache file: " + profileCache)
118+
}
119+
}(profileFile)
120+
profileEncoder := json.NewEncoder(profileFile)
121+
regularProfileMap := make(map[string]interface{})
122+
sch.profile.Range(func(key, value interface{}) bool {
123+
regularProfileMap[key.(string)] = value
124+
return true
125+
})
126+
err = profileEncoder.Encode(regularProfileMap)
127+
if err != nil {
128+
println("Error encoding profile cache file: " + profileCache)
129+
}
130+
131+
articleFile, err := os.Create(articleCache)
132+
if err != nil {
133+
println("Error opening article cache file: " + articleCache)
134+
return
135+
}
136+
defer func(file *os.File) {
137+
err := file.Close()
138+
if err != nil {
139+
println("Error closing profile cache file: " + articleCache)
140+
}
141+
}(articleFile)
142+
articleEncoder := json.NewEncoder(articleFile)
143+
regularArticleMap := make(map[string]interface{})
144+
sch.articles.Range(func(key, value interface{}) bool {
145+
regularArticleMap[key.(string)] = value
146+
return true
147+
})
148+
err = articleEncoder.Encode(regularArticleMap)
149+
if err != nil {
150+
println("Error encoding cache file: " + articleCache)
57151
}
58152
}
59153

60154
func (a Article) String() string {
61155
return "Article(\n Title=" + a.Title + "\n authors=" + a.Authors + "\n ScholarURL=" + a.ScholarURL + "\n Year=" + strconv.Itoa(a.Year) + "\n Month=" + strconv.Itoa(a.Month) + "\n Day=" + strconv.Itoa(a.Day) + "\n NumCitations=" + strconv.Itoa(a.NumCitations) + "\n Articles=" + strconv.Itoa(a.Articles) + "\n Description=" + a.Description + "\n PdfURL=" + a.PdfURL + "\n Journal=" + a.Journal + "\n Volume=" + a.Volume + "\n Pages=" + a.Pages + "\n Publisher=" + a.Publisher + "\n scholarCitedByURL=" + strings.Join(a.ScholarCitedByURLs, ", ") + "\n scholarVersionsURL=" + strings.Join(a.ScholarVersionsURLs, ", ") + "\n scholarRelatedURL=" + strings.Join(a.ScholarRelatedURLs, ", ") + "\n LastRetrieved=" + a.LastRetrieved.String() + "\n)"
62156
}
63157

64-
func (sch Scholar) QueryProfile(user string, limit int) []Article {
158+
func (sch *Scholar) QueryProfile(user string, limit int) []Article {
65159
return sch.QueryProfileDumpResponse(user, true, limit, false)
66160
}
67161

68-
func (sch Scholar) QueryProfileWithCache(user string, limit int) []Article {
69-
if sch.profile.Has(user) {
70-
p, _ := sch.profile.Get(user)
71-
lastAccess := p.LastRetrieved
162+
func (sch *Scholar) QueryProfileWithMemoryCache(user string, limit int) []Article {
163+
164+
profileResult, profileOk := sch.profile.Load(user)
165+
if profileOk {
166+
profile := profileResult.(Profile)
167+
lastAccess := profile.LastRetrieved
72168
if (time.Now().Sub(lastAccess)).Seconds() > MAX_TIME_PROFILE.Seconds() {
73169
println("Profile cache expired for User: " + user)
74-
sch.profile.Remove(user)
170+
sch.profile.Delete(user)
75171
articles := sch.QueryProfileDumpResponse(user, true, limit, false)
76172
var articleList []string
77173
for _, article := range articles {
78174
articleList = append(articleList, article.ScholarURL)
79175
}
80-
sch.profile.Set(user, Profile{User: user, LastRetrieved: time.Now(), Articles: articleList})
176+
newProfile := Profile{User: user, LastRetrieved: time.Now(), Articles: articleList}
177+
sch.profile.Store(user, newProfile)
81178
} else {
82179
println("Profile cache hit for User: " + user)
83180
// cache hit, return the Articles
84181
articles := make([]Article, 0)
85-
for _, articleURL := range p.Articles {
86-
if sch.articles.Has(articleURL) {
87-
cacheArticle, _ := sch.articles.Get(articleURL)
182+
for _, articleURL := range profile.Articles {
183+
articleResult, articleOk := sch.articles.Load(articleURL)
184+
if articleOk {
185+
cacheArticle := articleResult.(Article)
88186
if (time.Now().Sub(cacheArticle.LastRetrieved)).Seconds() > MAX_TIME_ARTICLE.Seconds() {
89187
println("Cache expired for article: " + articleURL + "\nLast Retrieved: " + cacheArticle.LastRetrieved.String() + "\nDifference: " + time.Now().Sub(cacheArticle.LastRetrieved).String())
90188
article := sch.QueryArticle(articleURL, Article{}, false)
91-
sch.articles.Set(articleURL, article)
189+
sch.articles.Store(articleURL, article)
92190
articles = append(articles, article)
93191
} else {
94192
println("Cache hit for article: " + articleURL)
@@ -99,7 +197,7 @@ func (sch Scholar) QueryProfileWithCache(user string, limit int) []Article {
99197
println("Cache miss for article: " + articleURL)
100198
article := sch.QueryArticle(articleURL, Article{}, false)
101199
articles = append(articles, article)
102-
sch.articles.Set(articleURL, article)
200+
sch.articles.Store(articleURL, article)
103201
}
104202
}
105203
return articles
@@ -112,7 +210,8 @@ func (sch Scholar) QueryProfileWithCache(user string, limit int) []Article {
112210
for _, article := range articles {
113211
articleList = append(articleList, article.ScholarURL)
114212
}
115-
sch.profile.Set(user, Profile{User: user, LastRetrieved: time.Now(), Articles: articleList})
213+
newProfile := Profile{User: user, LastRetrieved: time.Now(), Articles: articleList}
214+
sch.profile.Store(user, newProfile)
116215
return articles
117216
}
118217

@@ -127,7 +226,7 @@ func (sch Scholar) QueryProfileWithCache(user string, limit int) []Article {
127226
// want to get updated information from the profile page only to save requests
128227
//
129228
// if dumpResponse is true, it will print the response to stdout (useful for debugging)
130-
func (sch Scholar) QueryProfileDumpResponse(user string, queryArticles bool, limit int, dumpResponse bool) []Article {
229+
func (sch *Scholar) QueryProfileDumpResponse(user string, queryArticles bool, limit int, dumpResponse bool) []Article {
131230
var articles []Article
132231
client := &http.Client{}
133232

@@ -171,26 +270,27 @@ func (sch Scholar) QueryProfileDumpResponse(user string, queryArticles bool, lim
171270
article.NumCitations, _ = strconv.Atoi(s.Find(".gsc_a_c").Children().First().Text())
172271

173272
if queryArticles {
174-
if sch.articles.Has(BaseURL + tempURL) {
273+
articleResult, articleOk := sch.articles.Load(BaseURL + tempURL)
274+
if articleOk {
175275
// hit the cache
176-
cacheArticle, _ := sch.articles.Get(BaseURL + tempURL)
276+
cacheArticle := articleResult.(Article)
177277
if (time.Now().Sub(article.LastRetrieved)).Seconds() > MAX_TIME_ARTICLE.Seconds() {
178278
println("Cache expired for article" + BaseURL + tempURL + "\nLast Retrieved: " + cacheArticle.LastRetrieved.String() + "\nDifference: " + time.Now().Sub(cacheArticle.LastRetrieved).String())
179279
// expired cache entry, replace it
180-
sch.articles.Remove(BaseURL + tempURL)
280+
sch.articles.Delete(BaseURL + tempURL)
181281
article = sch.QueryArticle(BaseURL+tempURL, article, dumpResponse)
182-
sch.articles.Set(BaseURL+tempURL, article)
282+
sch.articles.Store(BaseURL+tempURL, article)
183283
} else {
184284
println("Cache hit for article" + BaseURL + tempURL)
185285
// not expired, update any new information
186286
cacheArticle.NumCitations = article.NumCitations // update the citations since that's all that might change
187287
article = cacheArticle
188-
sch.articles.Set(BaseURL+tempURL, article)
288+
sch.articles.Store(BaseURL+tempURL, article)
189289
}
190290
} else {
191291
println("Cache miss for article" + BaseURL + tempURL)
192292
article = sch.QueryArticle(BaseURL+tempURL, article, dumpResponse)
193-
sch.articles.Set(BaseURL+tempURL, article)
293+
sch.articles.Store(BaseURL+tempURL, article)
194294
}
195295
}
196296
articles = append(articles, article)
@@ -199,7 +299,7 @@ func (sch Scholar) QueryProfileDumpResponse(user string, queryArticles bool, lim
199299
return articles
200300
}
201301

202-
func (sch Scholar) QueryArticle(url string, article Article, dumpResponse bool) Article {
302+
func (sch *Scholar) QueryArticle(url string, article Article, dumpResponse bool) Article {
203303
fmt.Println("PULLING ARTICLE: " + url)
204304
article.ScholarURL = url
205305
client := &http.Client{}
@@ -274,7 +374,8 @@ func (sch Scholar) QueryArticle(url string, article Article, dumpResponse bool)
274374
article.Articles += 1
275375
articles := s.Find(".gsc_oci_value")
276376
articles.Find(".gsc_oci_merged_snippet").Each(func(i int, s *goquery.Selection) {
277-
// each one of these is an article. For an scholar-example with multiple see: https://scholar.google.com/citations?view_op=view_citation&hl=en&user=ECQMeb0AAAAJ&citation_for_view=ECQMeb0AAAAJ:u5HHmVD_uO8C
377+
// each one of these is an article. For a scholar-example with multiple see:
378+
// https://scholar.google.com/citations?view_op=view_citation&hl=en&user=ECQMeb0AAAAJ&citation_for_view=ECQMeb0AAAAJ:u5HHmVD_uO8C
278379
// this seems to happen if the entry is a book and there are Articles within it
279380
s.Find(".gsc_oms_link").Each(func(i int, l *goquery.Selection) {
280381
linkText := l.Text()

Diff for: scholar_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ func TestScholarQuerier(t *testing.T) {
1515
}
1616

1717
func TestProfileQuerier(t *testing.T) {
18-
sch := New()
18+
sch := New("profiles.json", "articles.json")
1919
articles := sch.QueryProfile("SbUmSEAAAAAJ", 1)
2020
assert.NotEmpty(t, articles)
2121

0 commit comments

Comments
 (0)