-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlogic.py
209 lines (184 loc) · 7.12 KB
/
logic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
"""
This module contains the logic for the Merritt plugin for Janeway
"""
__copyright__ = "Copyright (c) 2023, The Regents of the University of California"
__author__ = "Mahjabeen Yucekul"
__license__ = "BSD 3-Clause"
__maintainer__ = "California Digital Library"
import os
import re
import shutil
import requests
from utils.logger import get_logger
from django.utils import timezone
from .models import PreprintMerrittRequests, MerrittQueue
from django.conf import settings as django_settings
from pathlib import Path
from zipfile import ZipFile
logger = get_logger(__name__)
# A class that takes a preprint and does all the work needed to
# build a zip file for it and submit that zip file to Merritt
"""
Process one preprint by sending it to Merritt using repo setting information
about Merritt collection for the preprint
"""
class PreprintToMerritt:
preprint = None
reposetting = None
def __init__(self, preprint, reposetting):
self.preprint = preprint
self.reposetting = reposetting
def process(self):
# create a PreprintMerrittRequests to keep all the information related to curl request to Merritt
request = PreprintMerrittRequests(preprint = self.preprint, request_date=timezone.now())
request.save()
# update entry in MerrittQueue for the preprint
qitem = MerrittQueue.objects.get_or_create(preprint = self.preprint, defaults={'queue_date':timezone.now() })[0]
qitem.status = MerrittQueue.ItemStatus.PROCESSING
qitem.save()
try:
# put together the article and metadata to create a zip for Merritt
z = ZipForPreprint(self.preprint, self.preprint.repository)
zipfile = z.createZip()
except Exception as e:
# save error message in db
request.response = str(e)
request.status = PreprintMerrittRequests.SubmissionStatus.PREP_ERROR
request.save()
raise
try:
# send the zip file to Merritt using curl command
m = MerrittForPreprint(self.preprint, self.reposetting.merritt_collection, zipfile, request)
m.sendRequest()
request.status = PreprintMerrittRequests.SubmissionStatus.SENT
request.save()
except Exception as e:
# save error message in db
request.response = str(e)
request.status = PreprintMerrittRequests.SubmissionStatus.SEND_ERROR
request.save()
raise
# clear temp
ZipForPreprint.clearTmp()
print("DONE")
"""
Create a zip file for upload to Merritt by combining metadata from OAI endpoint and article file
"""
class ZipForPreprint:
preprint = None
repo = None
tmpfolder = None
filepath_base = "/apps/eschol/janeway/src/files/"
def __init__(self, ppobj, repoobj):
print("creating zip for preprint")
self.preprint = ppobj
self.repo = repoobj
def createZip(self):
print("creating zip file")
# create temp folder if needed
self.tmpfolder = django_settings.MERRITT_TMP + str(self.preprint.id)
zipfile = f'{django_settings.MERRITT_TMP}{self.preprint.id}.zip'
if not os.path.exists(django_settings.MERRITT_TMP):
os.mkdir(django_settings.MERRITT_TMP)
# empty the folder if needed
if os.path.exists(self.tmpfolder):
shutil.rmtree(self.tmpfolder)
os.mkdir(self.tmpfolder)
# copy files temp folder
articlepath = self.copyArticle()
metadatapath = self.generateMetadata()
# generate zip file
with ZipFile(zipfile, 'w') as zip_object:
zip_object.write(articlepath)
zip_object.write(metadatapath)
assert(os.path.exists(zipfile))
return zipfile
def copyArticle(self):
print("copy article file")
fullpath = self.filepath_base + str(self.preprint.current_version.file.file)
temppath = self.tmpfolder + '/' + Path(fullpath).name
shutil.copy(fullpath, temppath)
return temppath
def generateMetadata(self):
print("generate meta data")
params = {
'verb': 'GetRecord',
'identifier': f'oai:{self.repo.short_name}:id:{self.preprint.id}',
'metadataPrefix': 'jats'
}
response = requests.get(f'https://{self.repo.domain}/api/oai', params=params)
# save the metadata in temp folder
xmlname = f'{self.tmpfolder}/meta_{self.preprint.id}.xml'
with open(xmlname, "a") as f:
f.write(response.text)
return xmlname
def clearTmp():
shutil.rmtree(django_settings.MERRITT_TMP)
"""
Send zip to Merritt
"""
class MerrittForPreprint:
preprint = None
collection = None
zipname = None
request = None
def __init__(self, ppobj, colname, zipname, request):
print("created Merritt for preprint")
# give me the repo object from the plugin table with the repo merritt info
self.preprint = ppobj
self.collection = colname
self.zipname = zipname
self.request = request
def sendRequest(self):
print("create request and send Merritt update")
files = {
'file': open(self.zipname, 'rb'),
'type': (None, 'container'),
'submitter': (None, django_settings.MERRITT_USER),
'title': (None, re.sub(r'[^a-zA-Z0-9 ]', '', self.preprint.title)),
'date':(None, str(self.preprint.date_published)),
'creator': (None, self.getCreators()),
'responseForm': (None, 'xml'),
'notificationFormat': (None, 'json'),
'profile': (None, self.collection),
'localIdentifier': (None, self.preprint.id),
}
# save the request info
self.request.request_detail = str(files)
self.request.save()
# send request
response = requests.post(django_settings.MERRITT_URL, files=files, auth=(django_settings.MERRITT_USER, django_settings.MERRITT_KEY))
# save response
self.request.response = response.text
self.request.save()
return
def extractIDs(self, output):
print("extracting ids")
lines = output.splitlines()
for line in lines:
if "<bat:batchID>" in line:
batchId = line.split('>')[1].split('<')[0]
if "<bat:jobID>" in line:
jobId = line.split('>')[1].split('<')[0]
print(batchId)
print(jobId)
def getCreators(self):
print("get the creators")
names = []
count = 0
authors = self.preprint.preprintauthor_set.all()
# add first six authors - same as eschol citation format
for author in authors:
if count == 6:
break
contributor = author.account
if contributor.last_name:
count += 1
name = contributor.last_name
if contributor.first_name:
name += f', {contributor.first_name[0]}.'
names.append(name)
creators = "; ".join(names)
if len(authors) > 6:
creators += ", et al."
return creators