From aec54c133833c29208c2c27d55fdab890bd171c4 Mon Sep 17 00:00:00 2001
From: Richard Jones <richard@cottagelabs.com>
Date: Fri, 20 Oct 2023 10:33:03 +0100
Subject: [PATCH] Write journal data directly to CSV rather than accumulating it in memory

---
 portality/bll/services/journal.py | 50 +++++++++++++++----------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/portality/bll/services/journal.py b/portality/bll/services/journal.py
index e7298ad483..728be73f00 100644
--- a/portality/bll/services/journal.py
+++ b/portality/bll/services/journal.py
@@ -254,18 +254,11 @@ def _get_article_kvs(journal):
             return kvs
 
         # ~~!JournalCSV:Feature->Journal:Model~~
-        logger("Loading journal ids")
-        journal_ids = []
-        for j in models.Journal.all_in_doaj(page_size=1000):     #Fixme: limited by ES, this may not be sufficient
-            journal_ids.append(j.id)
-        logger("Journal ids loaded: {x}".format(x=len(journal_ids)))
-
-        cols = {}
-        for jid in journal_ids:
+        csvwriter = csv.writer(file_object)
+        first = True
+        for j in models.Journal.all_in_doaj(page_size=100):
             export_start = datetime.utcnow()
-            logger("Exporting journal {x}".format(x=jid))
-
-            j = models.Journal.pull(jid)
+            logger("Exporting journal {x}".format(x=j.id))
 
             time_log = []
             bj = j.bibjson()
@@ -289,29 +282,36 @@ def _get_article_kvs(journal):
                 for col in additional_columns:
                     additionals += col(j)
             time_log.append("{x} - got additionals".format(x=datetime.utcnow()))
-            cols[issn] = kvs + meta_kvs + article_kvs + additionals
+            row = kvs + meta_kvs + article_kvs + additionals
 
             # Get the toc URL separately from the meta kvs because it needs to be inserted earlier in the CSV
             # ~~-> ToC:WebRoute~~
             toc_kv = _get_doaj_toc_kv(j)
-            cols[issn].insert(2, toc_kv)
+            row.insert(2, toc_kv)
             time_log.append("{x} - got toc kvs".format(x=datetime.utcnow()))
 
+            if first is True:
+                qs = [q for q, _ in row]
+                csvwriter.writerow(qs)
+                first = False
+
+            vs = [v for _, v in row]
+            csvwriter.writerow(vs)
+            time_log.append("{x} - written row to csv".format(x=datetime.utcnow()))
+
             export_end = datetime.utcnow()
             if export_end - export_start > timedelta(seconds=10):
                 for l in time_log:
                     logger(l)
 
-        logger("All journals exported")
-        issns = cols.keys()
-
-        csvwriter = csv.writer(file_object)
-        qs = None
-        for i in sorted(issns):
-            if qs is None:
-                qs = [q for q, _ in cols[i]]
-                csvwriter.writerow(qs)
-            vs = [v for _, v in cols[i]]
-            csvwriter.writerow(vs)
-        logger("CSV Written")
+        logger("All journals exported and CSV written")
+        # issns = cols.keys()
+        # qs = None
+        # for i in sorted(issns):
+        #     if qs is None:
+        #         qs = [q for q, _ in cols[i]]
+        #         csvwriter.writerow(qs)
+        #     vs = [v for _, v in cols[i]]
+        #     csvwriter.writerow(vs)
+        # logger("CSV Written")