@@ -2,14 +2,16 @@ package go_scholar
2
2
3
3
import (
4
4
"bytes"
5
+ "encoding/json"
5
6
"fmt"
6
7
"github.com/PuerkitoBio/goquery"
7
- cmap "github.com/orcaman/concurrent-map/v2"
8
8
"io"
9
9
"log"
10
10
"net/http"
11
+ "os"
11
12
"strconv"
12
13
"strings"
14
+ "sync"
13
15
"time"
14
16
)
15
17
@@ -46,49 +48,145 @@ type Profile struct {
46
48
}
47
49
48
50
// Scholar holds the in-memory caches used by the query methods.
//
// Both caches are sync.Map values, which must not be copied after first use,
// so a Scholar should always be handled through a pointer.
type Scholar struct {
	// articles caches Article values, keyed by article URL.
	articles sync.Map
	// profile caches Profile values, keyed by the user string.
	profile sync.Map
}
52
54
53
- func New () Scholar {
54
- return Scholar {
55
- articles : cmap .New [Article ](),
56
- profile : cmap .New [Profile ](),
55
+ func New (profileCache string , articleCache string ) * Scholar {
56
+
57
+ profileFile , err := os .Open (profileCache )
58
+ if err != nil {
59
+ println ("Error opening profile cache file: " + profileCache + " - creating new cache" )
60
+ return & Scholar {}
61
+ }
62
+ defer func (file * os.File ) {
63
+ err := file .Close ()
64
+ if err != nil {
65
+ println ("Error closing profile cache file: " + profileCache )
66
+ }
67
+ }(profileFile )
68
+ profileDecoder := json .NewDecoder (profileFile )
69
+ var regularProfileMap map [string ]Profile
70
+ err = profileDecoder .Decode (& regularProfileMap )
71
+ if err != nil {
72
+ println ("Error decoding profile file: " + profileCache + " - creating new cache" )
73
+ return & Scholar {}
74
+ }
75
+
76
+ articleFile , err := os .Open (articleCache )
77
+ if err != nil {
78
+ println ("Error opening article cache file: " + articleCache + " - creating new cache" )
79
+ return & Scholar {}
80
+ }
81
+ defer func (file * os.File ) {
82
+ err := file .Close ()
83
+ if err != nil {
84
+ println ("Error closing article cache file: " + articleCache )
85
+ }
86
+ }(articleFile )
87
+ articleDecoder := json .NewDecoder (articleFile )
88
+ var regularArticleMap map [string ]Article
89
+ err = articleDecoder .Decode (& regularArticleMap )
90
+ if err != nil {
91
+ println ("Error decoding article cache file: " + articleCache + " - creating new cache" )
92
+ return & Scholar {}
93
+ }
94
+
95
+ sch := Scholar {}
96
+
97
+ // convert the regular maps to sync maps
98
+ for key , value := range regularProfileMap {
99
+ sch .profile .Store (key , value )
100
+ }
101
+ for key , value := range regularArticleMap {
102
+ sch .articles .Store (key , value )
103
+ }
104
+
105
+ return & sch
106
+ }
107
+
108
+ func (sch * Scholar ) SaveCache (profileCache string , articleCache string ) {
109
+ profileFile , err := os .Create (profileCache )
110
+ if err != nil {
111
+ println ("Error opening profile cache file: " + profileCache )
112
+ return
113
+ }
114
+ defer func (file * os.File ) {
115
+ err := file .Close ()
116
+ if err != nil {
117
+ println ("Error closing profile cache file: " + profileCache )
118
+ }
119
+ }(profileFile )
120
+ profileEncoder := json .NewEncoder (profileFile )
121
+ regularProfileMap := make (map [string ]interface {})
122
+ sch .profile .Range (func (key , value interface {}) bool {
123
+ regularProfileMap [key .(string )] = value
124
+ return true
125
+ })
126
+ err = profileEncoder .Encode (regularProfileMap )
127
+ if err != nil {
128
+ println ("Error encoding profile cache file: " + profileCache )
129
+ }
130
+
131
+ articleFile , err := os .Create (articleCache )
132
+ if err != nil {
133
+ println ("Error opening article cache file: " + articleCache )
134
+ return
135
+ }
136
+ defer func (file * os.File ) {
137
+ err := file .Close ()
138
+ if err != nil {
139
+ println ("Error closing profile cache file: " + articleCache )
140
+ }
141
+ }(articleFile )
142
+ articleEncoder := json .NewEncoder (articleFile )
143
+ regularArticleMap := make (map [string ]interface {})
144
+ sch .articles .Range (func (key , value interface {}) bool {
145
+ regularArticleMap [key .(string )] = value
146
+ return true
147
+ })
148
+ err = articleEncoder .Encode (regularArticleMap )
149
+ if err != nil {
150
+ println ("Error encoding cache file: " + articleCache )
57
151
}
58
152
}
59
153
60
154
// String renders the article as a multi-line "Article(...)" listing with one
// "name=value" entry per line. Integer fields are formatted in base 10 and the
// URL slices are joined with ", ".
func (a Article) String() string {
	const sep = "\n "

	entries := []string{
		"Title=" + a.Title,
		"authors=" + a.Authors,
		"ScholarURL=" + a.ScholarURL,
		"Year=" + strconv.Itoa(a.Year),
		"Month=" + strconv.Itoa(a.Month),
		"Day=" + strconv.Itoa(a.Day),
		"NumCitations=" + strconv.Itoa(a.NumCitations),
		"Articles=" + strconv.Itoa(a.Articles),
		"Description=" + a.Description,
		"PdfURL=" + a.PdfURL,
		"Journal=" + a.Journal,
		"Volume=" + a.Volume,
		"Pages=" + a.Pages,
		"Publisher=" + a.Publisher,
		"scholarCitedByURL=" + strings.Join(a.ScholarCitedByURLs, ", "),
		"scholarVersionsURL=" + strings.Join(a.ScholarVersionsURLs, ", "),
		"scholarRelatedURL=" + strings.Join(a.ScholarRelatedURLs, ", "),
		"LastRetrieved=" + a.LastRetrieved.String(),
	}

	return "Article(" + sep + strings.Join(entries, sep) + sep + ")"
}
63
157
64
- func (sch Scholar ) QueryProfile (user string , limit int ) []Article {
158
+ func (sch * Scholar ) QueryProfile (user string , limit int ) []Article {
65
159
return sch .QueryProfileDumpResponse (user , true , limit , false )
66
160
}
67
161
68
- func (sch Scholar ) QueryProfileWithCache (user string , limit int ) []Article {
69
- if sch .profile .Has (user ) {
70
- p , _ := sch .profile .Get (user )
71
- lastAccess := p .LastRetrieved
162
+ func (sch * Scholar ) QueryProfileWithMemoryCache (user string , limit int ) []Article {
163
+
164
+ profileResult , profileOk := sch .profile .Load (user )
165
+ if profileOk {
166
+ profile := profileResult .(Profile )
167
+ lastAccess := profile .LastRetrieved
72
168
if (time .Now ().Sub (lastAccess )).Seconds () > MAX_TIME_PROFILE .Seconds () {
73
169
println ("Profile cache expired for User: " + user )
74
- sch .profile .Remove (user )
170
+ sch .profile .Delete (user )
75
171
articles := sch .QueryProfileDumpResponse (user , true , limit , false )
76
172
var articleList []string
77
173
for _ , article := range articles {
78
174
articleList = append (articleList , article .ScholarURL )
79
175
}
80
- sch .profile .Set (user , Profile {User : user , LastRetrieved : time .Now (), Articles : articleList })
176
+ newProfile := Profile {User : user , LastRetrieved : time .Now (), Articles : articleList }
177
+ sch .profile .Store (user , newProfile )
81
178
} else {
82
179
println ("Profile cache hit for User: " + user )
83
180
// cache hit, return the Articles
84
181
articles := make ([]Article , 0 )
85
- for _ , articleURL := range p .Articles {
86
- if sch .articles .Has (articleURL ) {
87
- cacheArticle , _ := sch .articles .Get (articleURL )
182
+ for _ , articleURL := range profile .Articles {
183
+ articleResult , articleOk := sch .articles .Load (articleURL )
184
+ if articleOk {
185
+ cacheArticle := articleResult .(Article )
88
186
if (time .Now ().Sub (cacheArticle .LastRetrieved )).Seconds () > MAX_TIME_ARTICLE .Seconds () {
89
187
println ("Cache expired for article: " + articleURL + "\n Last Retrieved: " + cacheArticle .LastRetrieved .String () + "\n Difference: " + time .Now ().Sub (cacheArticle .LastRetrieved ).String ())
90
188
article := sch .QueryArticle (articleURL , Article {}, false )
91
- sch .articles .Set (articleURL , article )
189
+ sch .articles .Store (articleURL , article )
92
190
articles = append (articles , article )
93
191
} else {
94
192
println ("Cache hit for article: " + articleURL )
@@ -99,7 +197,7 @@ func (sch Scholar) QueryProfileWithCache(user string, limit int) []Article {
99
197
println ("Cache miss for article: " + articleURL )
100
198
article := sch .QueryArticle (articleURL , Article {}, false )
101
199
articles = append (articles , article )
102
- sch .articles .Set (articleURL , article )
200
+ sch .articles .Store (articleURL , article )
103
201
}
104
202
}
105
203
return articles
@@ -112,7 +210,8 @@ func (sch Scholar) QueryProfileWithCache(user string, limit int) []Article {
112
210
for _ , article := range articles {
113
211
articleList = append (articleList , article .ScholarURL )
114
212
}
115
- sch .profile .Set (user , Profile {User : user , LastRetrieved : time .Now (), Articles : articleList })
213
+ newProfile := Profile {User : user , LastRetrieved : time .Now (), Articles : articleList }
214
+ sch .profile .Store (user , newProfile )
116
215
return articles
117
216
}
118
217
@@ -127,7 +226,7 @@ func (sch Scholar) QueryProfileWithCache(user string, limit int) []Article {
127
226
// want to get updated information from the profile page only to save requests
128
227
//
129
228
// if dumpResponse is true, it will print the response to stdout (useful for debugging)
130
- func (sch Scholar ) QueryProfileDumpResponse (user string , queryArticles bool , limit int , dumpResponse bool ) []Article {
229
+ func (sch * Scholar ) QueryProfileDumpResponse (user string , queryArticles bool , limit int , dumpResponse bool ) []Article {
131
230
var articles []Article
132
231
client := & http.Client {}
133
232
@@ -171,26 +270,27 @@ func (sch Scholar) QueryProfileDumpResponse(user string, queryArticles bool, lim
171
270
article .NumCitations , _ = strconv .Atoi (s .Find (".gsc_a_c" ).Children ().First ().Text ())
172
271
173
272
if queryArticles {
174
- if sch .articles .Has (BaseURL + tempURL ) {
273
+ articleResult , articleOk := sch .articles .Load (BaseURL + tempURL )
274
+ if articleOk {
175
275
// hit the cache
176
- cacheArticle , _ := sch . articles . Get ( BaseURL + tempURL )
276
+ cacheArticle := articleResult .( Article )
177
277
if (time .Now ().Sub (article .LastRetrieved )).Seconds () > MAX_TIME_ARTICLE .Seconds () {
178
278
println ("Cache expired for article" + BaseURL + tempURL + "\n Last Retrieved: " + cacheArticle .LastRetrieved .String () + "\n Difference: " + time .Now ().Sub (cacheArticle .LastRetrieved ).String ())
179
279
// expired cache entry, replace it
180
- sch .articles .Remove (BaseURL + tempURL )
280
+ sch .articles .Delete (BaseURL + tempURL )
181
281
article = sch .QueryArticle (BaseURL + tempURL , article , dumpResponse )
182
- sch .articles .Set (BaseURL + tempURL , article )
282
+ sch .articles .Store (BaseURL + tempURL , article )
183
283
} else {
184
284
println ("Cache hit for article" + BaseURL + tempURL )
185
285
// not expired, update any new information
186
286
cacheArticle .NumCitations = article .NumCitations // update the citations since thats all that might change
187
287
article = cacheArticle
188
- sch .articles .Set (BaseURL + tempURL , article )
288
+ sch .articles .Store (BaseURL + tempURL , article )
189
289
}
190
290
} else {
191
291
println ("Cache miss for article" + BaseURL + tempURL )
192
292
article = sch .QueryArticle (BaseURL + tempURL , article , dumpResponse )
193
- sch .articles .Set (BaseURL + tempURL , article )
293
+ sch .articles .Store (BaseURL + tempURL , article )
194
294
}
195
295
}
196
296
articles = append (articles , article )
@@ -199,7 +299,7 @@ func (sch Scholar) QueryProfileDumpResponse(user string, queryArticles bool, lim
199
299
return articles
200
300
}
201
301
202
- func (sch Scholar ) QueryArticle (url string , article Article , dumpResponse bool ) Article {
302
+ func (sch * Scholar ) QueryArticle (url string , article Article , dumpResponse bool ) Article {
203
303
fmt .Println ("PULLING ARTICLE: " + url )
204
304
article .ScholarURL = url
205
305
client := & http.Client {}
@@ -274,7 +374,8 @@ func (sch Scholar) QueryArticle(url string, article Article, dumpResponse bool)
274
374
article .Articles += 1
275
375
articles := s .Find (".gsc_oci_value" )
276
376
articles .Find (".gsc_oci_merged_snippet" ).Each (func (i int , s * goquery.Selection ) {
277
- // each one of these is an article. For an scholar-example with multiple see: https://scholar.google.com/citations?view_op=view_citation&hl=en&user=ECQMeb0AAAAJ&citation_for_view=ECQMeb0AAAAJ:u5HHmVD_uO8C
377
+ // each one of these is an article. For a scholar-example with multiple see:
378
+ // https://scholar.google.com/citations?view_op=view_citation&hl=en&user=ECQMeb0AAAAJ&citation_for_view=ECQMeb0AAAAJ:u5HHmVD_uO8C
278
379
// this seems to happen if the entry is a book and there are Articles within it
279
380
s .Find (".gsc_oms_link" ).Each (func (i int , l * goquery.Selection ) {
280
381
linkText := l .Text ()
0 commit comments