Spaces:

faizhalas
/

coconut

Running

App Files Files Community

faizhalas committed on Sep 30

Commit

2d38bac

verified ·

1 Parent(s): 1c732f3

Create tools/sourceformat.py

Browse files

Files changed (1) hide show

tools/sourceformat.py +328 -0

tools/sourceformat.py ADDED Viewed

	@@ -0,0 +1,328 @@

+from io import StringIO, BytesIO
+import pymarc
+import requests
+import string
+import pandas as pd
+import tarfile
+try:
+    from lxml import etree as ET
+except ImportError:
+    import xml.etree.ElementTree as ET
+#metadata for htrc worksets
def htrc(self):
    """Add a 'Keywords' column built from each volume's MARC 650 fields.

    For every value in the 'htid' column the HathiTrust catalog API is
    queried, the MARC-XML of the record the volume comes from is parsed
    with pymarc, and the values of its 650 fields are joined into one
    string (punctuation removed, each term followed by "; ").  650 values
    containing "http" or "ocolc" are skipped.

    Parameters:
        self: pandas DataFrame with an 'htid' column of string ids.

    Returns:
        The same DataFrame, with the 'Keywords' column added in place.

    Raises:
        Whatever requests / pymarc raise on network or parse failures, and
        KeyError / IndexError if the API response lacks the expected keys.
    """
    #translation table that deletes all ASCII punctuation, built once
    strip_punct = str.maketrans('', '', string.punctuation)
    #one keyword string per volume/htid, in row order
    keylist = []
    for htid in self['htid'].values.tolist():
        #api call for the extra metadata using htid; the timeout keeps a
        #stalled connection from hanging the caller forever
        response = requests.get(
            "https://catalog.hathitrust.org/api/volumes/full/htid/"+htid+".json",
            timeout=30,
        )
        extradata = response.json()
        #get record id and use it to get the xml/marc file with the actual metadata
        recid = extradata['items'][0]['fromRecord']
        xmlmarc = extradata['records'][recid]['marc-xml']
        #turn the formatted xml into an actual pymarc record
        with StringIO(xmlmarc) as xml:
            marc = pymarc.parse_xml_to_array(xml)[0]
        #start from an empty string for every volume so that one volume's
        #keywords never leak into the next row
        keywords = ""
        for term in marc.get_fields('650'):
            value = term.value()
            lowered = value.lower()
            if "http" in lowered or "ocolc" in lowered:
                continue
            keywords += value.translate(strip_punct) + "; "
        keylist.append(keywords)
    self['Keywords'] = keylist
    return self
def htrcxtra(self):
    """Add a 'pages' column built from each volume's MARC 350 fields.

    For every value in the 'htid' column the HathiTrust catalog API is
    queried, the MARC-XML of the record the volume comes from is parsed
    with pymarc, and the values of its 350 fields are concatenated into
    one string for that row.

    Parameters:
        self: pandas DataFrame with an 'htid' column of string ids.

    Returns:
        The same DataFrame, with the 'pages' column added in place.

    Raises:
        Whatever requests / pymarc raise on network or parse failures, and
        KeyError / IndexError if the API response lacks the expected keys.
    """
    #one string per volume/htid, in row order
    pagecount = []
    for htid in self['htid'].values.tolist():
        #api call for the extra metadata using htid; the timeout keeps a
        #stalled connection from hanging the caller forever
        response = requests.get(
            "https://catalog.hathitrust.org/api/volumes/full/htid/"+htid+".json",
            timeout=30,
        )
        extradata = response.json()
        #get record id and use it to get the xml/marc file with the actual metadata
        recid = extradata['items'][0]['fromRecord']
        xmlmarc = extradata['records'][recid]['marc-xml']
        #turn the formatted xml into an actual pymarc record
        with StringIO(xmlmarc) as xml:
            marc = pymarc.parse_xml_to_array(xml)[0]
        #collect the fields inside the per-volume loop so every row gets
        #its own value and the list length matches the frame
        #NOTE(review): MARC 21 lists 350 as the obsolete price field and
        #keeps extent/page counts in 300 - confirm the intended tag
        pages = "".join(term.value() for term in marc.get_fields('350'))
        pagecount.append(pages)
    self['pages'] = pagecount
    return self
+#format files from dimensions
def dim(file):
    """Drop the first column and promote the first remaining row to header.

    The frame is written to CSV text without header or index and read
    back with pandas' default settings, so the first data row of the
    input becomes the column names of the result.

    Parameters:
        file: pandas DataFrame to reformat.

    Returns:
        A new DataFrame parsed from the re-serialised CSV text.
    """
    #label of the leading column, kept as an Index so drop() gets a list-like
    leading = file.columns[[0]]
    trimmed = file.drop(columns=leading)
    #no header/index on output: the first row is what read_csv sees as header
    csv_text = trimmed.to_csv(header=False, index=False)
    return pd.read_csv(StringIO(csv_text))
def _pub_texts(fronts, tag):
    """Return the .text of every `tag` element found under the given <front> elements."""
    return [node.text for front in fronts for node in front.iter(tag)]


def readPub(tar):
    """Build a bibliographic DataFrame from a gzipped tarball of article XML files.

    Every regular member of the archive is parsed as one XML article and
    only the <front> children of its root are inspected.  Each article
    yields exactly one row, so all columns stay aligned:

        Title          every <article-title> text followed by ", "
        Keywords       every <kwd> text followed by "; "
        Authors        <given-names>/<surname> pairs joined with ", "
        Year           first <year> that parses as an integer
        Document Type  first <publisher-name>
        Source title   first <journal-title>

    Values that cannot be found are stored as the string "empty".

    Parameters:
        tar: bytes of a gzip-compressed tar archive.

    Returns:
        pandas DataFrame with one row per archive member.

    Raises:
        tarfile.TarError for an unreadable archive and the XML parser's
        ParseError for a member that is not well-formed XML.
    """
    missing = "empty"
    #read every regular member; extractfile() gives None for non-file entries
    payloads = []
    with tarfile.open(fileobj=BytesIO(tar), mode='r:gz') as archive:
        for member in archive.getmembers():
            handle = archive.extractfile(member)
            if handle is not None:
                payloads.append(handle.read())
    rows = []
    for payload in payloads:
        #hand the raw bytes to the parser so it applies the encoding the
        #document itself declares instead of assuming UTF-8
        root = ET.parse(BytesIO(payload)).getroot()
        #only the front matter carries the metadata that we care about
        fronts = [child for child in root if child.tag == "front"]
        title = "".join(
            text + ", " if text is not None else " "
            for text in _pub_texts(fronts, 'article-title')
        )
        key = "".join(
            text + "; " if text is not None else " "
            for text in _pub_texts(fronts, 'kwd')
        )
        #pair given names with surnames; a missing half is skipped rather
        #than crashing on None + str
        fullnames = [
            ' '.join(part for part in (first, last) if part)
            for first, last in zip(
                _pub_texts(fronts, 'given-names'), _pub_texts(fronts, 'surname')
            )
        ]
        author = ', '.join(fullnames)
        #an article can carry several <year> elements; keep the first
        #usable one so the column holds one value per article
        year = missing
        for text in _pub_texts(fronts, 'year'):
            try:
                year = int(text)
            except (TypeError, ValueError):
                continue
            break
        publisher = next(
            (text for text in _pub_texts(fronts, 'publisher-name') if text is not None),
            missing,
        )
        jtitle = next(
            (text for text in _pub_texts(fronts, 'journal-title') if text is not None),
            missing,
        )
        rows.append((title, key, author, year, publisher, jtitle))
    #placeholders are already filled in per row, so no fillna pass is needed
    return pd.DataFrame(
        rows,
        columns=["Title", "Keywords", "Authors", "Year", "Document Type", "Source title"],
    )
def readxml(file):
    """Build a bibliographic DataFrame from one XML document holding many articles.

    The document root is expected to contain one child per article; only
    the <front> children of each article are inspected.  Each <front>
    yields exactly one row, so all columns stay aligned:

        Title          first <article-title> text
        Keywords       every <kwd> text followed by "; "
        Authors        <given-names>/<surname> pairs joined with ", "
        Year           first <year> that parses as an integer
        Document Type  first <publisher-name>
        Source title   first <journal-title>

    Values that cannot be found are stored as the string "empty".

    Parameters:
        file: XML text (str or bytes) accepted by ET.fromstring.

    Returns:
        pandas DataFrame with one row per <front> element.

    Raises:
        The XML parser's ParseError if the input is not well-formed.
    """
    missing = "empty"
    root = ET.fromstring(file)
    rows = []
    for record in root:
        for front in record:
            #everything outside the front matter is irrelevant here
            if front.tag != "front":
                continue
            titles = [node.text for node in front.iter('article-title')]
            title = titles[0] if titles and titles[0] is not None else missing
            key = "".join(
                node.text + "; " for node in front.iter('kwd') if node.text is not None
            )
            firstname = [node.text for node in front.iter('given-names')]
            lastname = [node.text for node in front.iter('surname')]
            #a missing half of a name is skipped rather than crashing on None + str
            fullnames = [
                ' '.join(part for part in (first, last) if part)
                for first, last in zip(firstname, lastname)
            ]
            author = ', '.join(fullnames)
            #a front can carry several <year> elements; keep the first
            #usable one so the column holds one value per article
            year = missing
            for node in front.iter('year'):
                try:
                    year = int(node.text)
                except (TypeError, ValueError):
                    continue
                break
            jtitle = next(
                (node.text for node in front.iter('journal-title') if node.text is not None),
                missing,
            )
            publisher = next(
                (node.text for node in front.iter('publisher-name') if node.text is not None),
                missing,
            )
            rows.append((title, key, author, year, publisher, jtitle))
    #journal title goes to "Source title" and publisher to "Document Type",
    #the same mapping readPub in this file uses; the two were swapped here
    return pd.DataFrame(
        rows,
        columns=["Title", "Keywords", "Authors", "Year", "Document Type", "Source title"],
    )
def medline(file):
    """Parse a MEDLINE-tagged export into a DataFrame, one row per record.

    Records are blocks of text separated by a blank line.  Within a
    record, lines are recognised by their leading six-character tag:

        "TI  - "  title (continuation lines, which start with whitespace,
                  are appended with single spaces)
        "FAU - "  author, each followed by "; "
        "DP  - "  publication date, first four characters read as the year
        "MH  - "  MeSH keyword, each followed by "; "
        "OT  - "  other keyword, each followed by "; "

    A record without a title or without a usable year gets the string
    "empty" in that column.

    Parameters:
        file: binary file-like object; its bytes are decoded with the
            default codec of bytes.decode().

    Returns:
        pandas DataFrame with columns Title, Authors, Year,
        MeSH Keywords and Other Keywords.
    """
    missing = "empty"
    text = file.read().decode()
    #normalise line endings first, otherwise CRLF files never contain the
    #"\n\n" record separator and collapse into a single record
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    rows = []
    #articles are separated by blank lines so separate them
    for paper in text.split('\n\n'):
        #skip the empty blocks produced by leading/trailing/extra blank lines
        if not paper.strip():
            continue
        largetext = paper.splitlines()
        title = None
        year = None
        names = ""
        meshk = ""
        otherk = ""
        for pos, line in enumerate(largetext):
            tag, value = line[:6], line[6:]
            if tag == "TI  - ":
                if title is None:
                    #gather every continuation line; slicing past the end
                    #is safe when the title is the record's last line
                    parts = [value]
                    for extra in largetext[pos + 1:]:
                        if not extra[:1].isspace():
                            break
                        parts.append(extra.strip())
                    title = " ".join(parts)
            elif tag == "FAU - ":
                names += value + "; "
            elif tag == "DP  - ":
                if year is None:
                    try:
                        year = int(value[:4])
                    except ValueError:
                        #leave the year unset when the date has no leading year
                        pass
            elif tag == "MH  - ":
                meshk += value + "; "
            elif tag == "OT  - ":
                otherk += value + "; "
        #one tuple per record keeps every column the same length and aligned
        rows.append((
            title if title is not None else missing,
            names,
            year if year is not None else missing,
            meshk,
            otherk,
        ))
    return pd.DataFrame(
        rows,
        columns=['Title', 'Authors', 'Year', 'MeSH Keywords', 'Other Keywords'],
    )