faizhalas committed
Commit 2d38bac · verified · 1 Parent(s): 1c732f3

Create tools/sourceformat.py

Files changed (1)
  1. tools/sourceformat.py +328 -0
tools/sourceformat.py ADDED
@@ -0,0 +1,328 @@
+ from io import StringIO, BytesIO
+ import pymarc
+ import requests
+ import string
+ import pandas as pd
+ import tarfile
+ try:
+     from lxml import etree as ET
+ except ImportError:
+     import xml.etree.ElementTree as ET
+
+ #metadata for htrc worksets
+ def htrc(self):
+
+     #array holding the keyword string for each volume/htid, to add to the file
+     keylist = []
+
+     #get the htids of the volumes
+     htids = self['htid'].values.tolist()
+     #iterate through the list of htids
+     for htid in htids:
+
+         #string of keywords for this volume/htid, reset for each volume
+         keywords = ""
+
+         #api call for the extra metadata using the htid
+         extradata = requests.get("https://catalog.hathitrust.org/api/volumes/full/htid/" + htid + ".json")
+
+         #turn the response into json
+         extradata = extradata.json()
+
+         #get the record id and use it to get the marc-xml with the actual metadata
+         recid = extradata['items'][0]['fromRecord']
+         xmlmarc = extradata['records'][recid]['marc-xml']
+
+         #parse the xml string into a pymarc record
+         xml = StringIO(xmlmarc)
+         marc = pymarc.parse_xml_to_array(xml)[0]
+         xml.close()
+
+         #collect subject terms (MARC field 650), skipping URLs and OCLC identifiers
+         for term in marc.get_fields('650'):
+             value = term.value()
+             if "http" in value.lower() or "ocolc" in value.lower():
+                 continue
+             keywords += value.translate(str.maketrans('', '', string.punctuation)) + "; "
+         keylist.append(keywords)
+     self['Keywords'] = keylist
+     return self
+
+ #page metadata for htrc worksets
+ def htrcxtra(self):
+
+     #array holding the page string for each volume/htid, to add to the file
+     pagecount = []
+
+     #get the htids of the volumes
+     htids = self['htid'].values.tolist()
+     #iterate through the list of htids
+     for htid in htids:
+
+         #string of page data for this volume/htid, reset for each volume
+         pages = ""
+
+         #api call for the extra metadata using the htid
+         extradata = requests.get("https://catalog.hathitrust.org/api/volumes/full/htid/" + htid + ".json")
+
+         #turn the response into json
+         extradata = extradata.json()
+
+         #get the record id and use it to get the marc-xml with the actual metadata
+         recid = extradata['items'][0]['fromRecord']
+         xmlmarc = extradata['records'][recid]['marc-xml']
+
+         #parse the xml string into a pymarc record
+         xml = StringIO(xmlmarc)
+         marc = pymarc.parse_xml_to_array(xml)[0]
+         xml.close()
+
+         #collect page data from MARC field 350
+         for term in marc.get_fields('350'):
+             pages += term.value()
+         pagecount.append(pages)
+     self['pages'] = pagecount
+     return self
+
+
+ #format files from dimensions: drop the index column, then re-read the csv so the first data row becomes the header
+ def dim(file):
+     formatted = file.drop(file.columns[[0]], axis=1)
+
+     done = pd.read_csv(StringIO(formatted.to_csv(header=False, index=False)))
+
+     return done
+
+
+
+ #read article xmls out of a gzipped tarball into a dataframe
+ def readPub(tar):
+
+     #list to put the xmls from the tarfile in
+     xmllist = []
+
+     readfile = BytesIO(tar)
+
+     #extract each member of the tarfile into the list
+     files = tarfile.open(fileobj=readfile, mode='r:gz')
+     for member in files.getmembers():
+         singlefile = files.extractfile(member)
+         if singlefile is not None:
+             article = singlefile.read().decode("utf-8")
+             xmllist.append(StringIO(article))
+
+     #lists for each data point
+     titles = []
+     years = []
+     keys = []
+     authors = []
+     publishers = []
+     journaltitles = []
+
+     #go through each xml file in the list
+     for art in xmllist:
+
+         #make a parseable element tree out of the xml file
+         tree = ET.parse(art)
+         root = tree.getroot()
+
+         #remove parts of the main branch that do not have metadata we care about
+         for child in list(root):
+             if child.tag != "front":
+                 root.remove(child)
+
+         #names to concatenate for each article
+         firstname = []
+         lastname = []
+
+         #individual strings for multiple keywords/titles
+         key = ""
+         title = ""
+
+         for target in root.iter('article-title'):
+             if target.text is not None:
+                 title += target.text + ", "
+             else:
+                 title += " "
+         for target in root.iter('kwd'):
+             if target.text is not None:
+                 key += target.text + "; "
+             else:
+                 key += " "
+         for target in root.iter('year'):
+             years.append(int(target.text))
+         for names in root.iter('given-names'):
+             firstname.append(names.text)
+         for names in root.iter('surname'):
+             lastname.append(names.text)
+         for target in root.iter('journal-title'):
+             journaltitles.append(target.text)
+         for target in root.iter('publisher-name'):
+             publishers.append(target.text)
+
+         titles.append(title)
+         keys.append(key)
+
+         #join the given names and surnames into a single author string
+         fullnames = [first + ' ' + last for first, last in zip(firstname, lastname)]
+         authors.append(', '.join(fullnames))
+
+     data = pd.DataFrame()
+
+     data["Title"] = pd.Series(titles)
+     data["Keywords"] = pd.Series(keys)
+     data["Authors"] = pd.Series(authors)
+     data["Year"] = pd.Series(years)
+     data["Document Type"] = pd.Series(publishers)
+     data["Source title"] = pd.Series(journaltitles)
+
+     data.fillna(value="empty", inplace=True)
+
+     return data
+
+
+ #read a single xml file containing multiple articles into a dataframe
+ def readxml(file):
+     root = ET.fromstring(file)
+
+     #remove the parts of each article that do not have metadata we care about
+     for child in list(root):
+         for lchild in list(child):
+             if lchild.tag != "front":
+                 child.remove(lchild)
+
+     #lists for each data point
+     keys = []
+     titles = []
+     authors = []
+     jtitle = []
+     publishers = []
+     years = []
+
+     for child in list(root):
+         for article in list(child):
+             key = ""
+             firstname = []
+             lastname = []
+             for target in article.iter('article-title'):
+                 if target.text is not None:
+                     titles.append(target.text)
+                 else:
+                     titles.append("empty")
+             for target in article.iter('kwd'):
+                 if target.text is not None:
+                     key += target.text + "; "
+             keys.append(key)
+             for target in article.iter('given-names'):
+                 firstname.append(target.text)
+             for target in article.iter('surname'):
+                 lastname.append(target.text)
+
+             #join the given names and surnames into a single author string
+             fullnames = [first + ' ' + last for first, last in zip(firstname, lastname)]
+             authors.append(', '.join(fullnames))
+
+             for target in article.iter('journal-title'):
+                 jtitle.append(target.text)
+             for target in article.iter('publisher-name'):
+                 publishers.append(target.text)
+
+             for target in article.iter('year'):
+                 years.append(int(target.text))
+
+     frame = pd.DataFrame()
+
+     frame["Title"] = pd.Series(titles)
+     frame["Keywords"] = pd.Series(keys)
+     frame["Authors"] = pd.Series(authors)
+     frame["Year"] = pd.Series(years)
+     #match the column mapping used in readPub: publisher under Document Type, journal title under Source title
+     frame["Document Type"] = pd.Series(publishers)
+     frame["Source title"] = pd.Series(jtitle)
+
+     frame.fillna(value="empty", inplace=True)
+
+     return frame
+
+ #read a medline-format text export into a dataframe
+ def medline(file):
+
+     textfile = file.read()
+
+     text = textfile.decode()
+
+     authors = []
+     titles = []
+     year = []
+     meshkeys = []
+     otherkeys = []
+
+     #articles are separated by blank lines, so split on them
+     articles = text.split('\n\n')
+
+     for paper in articles:
+         names = ""
+         meshk = ""
+         otherk = ""
+         largetext = paper.splitlines()
+         for pos, line in enumerate(largetext):
+             #title
+             if "TI - " in line:
+                 startpos = line.index("-") + 2
+                 #if the title wraps onto the next line, append that line as well
+                 if pos + 1 < len(largetext) and "- " not in largetext[pos + 1]:
+                     titles.append(line[startpos:] + " " + largetext[pos + 1].strip())
+                 else:
+                     titles.append(line[startpos:])
+             #author
+             if "FAU - " in line:
+                 startpos = line.index("-") + 2
+                 names += line[startpos:] + "; "
+             #year (first four characters of the date of publication)
+             if "DP - " in line:
+                 startpos = line.index("-") + 2
+                 year.append(int(line[startpos:startpos + 4]))
+             #mesh key terms
+             if "MH - " in line:
+                 startpos = line.index("-") + 2
+                 meshk += line[startpos:] + "; "
+             #other key terms
+             if "OT - " in line:
+                 startpos = line.index("-") + 2
+                 otherk += line[startpos:] + "; "
+
+         authors.append(names)
+         meshkeys.append(meshk)
+         otherkeys.append(otherk)
+
+     frame = pd.DataFrame()
+
+     frame['Title'] = pd.Series(titles)
+     frame['Authors'] = pd.Series(authors)
+     frame['Year'] = pd.Series(year)
+     frame['MeSH Keywords'] = pd.Series(meshkeys)
+     frame['Other Keywords'] = pd.Series(otherkeys)
+
+     frame.fillna(value="empty", inplace=True)
+
+     return frame
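
A minimal usage sketch, not part of the commit: it assumes the module is importable as tools.sourceformat, that the workset CSV has an htid column, and that the file names are placeholders.

import pandas as pd
from tools import sourceformat

#HathiTrust workset: htrc() adds a 'Keywords' column in place and returns the frame
workset = pd.read_csv("workset.csv")          #assumed to contain an 'htid' column
workset = sourceformat.htrc(workset)

#MEDLINE export: medline() expects a binary file object and returns a new DataFrame
with open("pubmed_export.txt", "rb") as fh:   #hypothetical file name
    records = sourceformat.medline(fh)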