Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JSON záznam pro Harvest #584

Merged
merged 21 commits into from
Aug 31, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
fdf4cb3
add TopicCollection.[collection_alias, aggregation_with_same_type], i…
Fasand May 4, 2021
1e13521
first shot at creating JSON for Harvest with many TODOs RE #402
Fasand May 10, 2021
464046a
use DEBUG=True for local docker, don't cache templates if in DEBUG
Fasand May 10, 2021
fedea62
num sources in a category are computed for PUBLIC sources, not ARCHIV…
Fasand May 10, 2021
f93c10a
Harvest custom sources don't need to have archiving states
Fasand May 10, 2021
08a6ae2
add Topic Collection seed URLs link to detail
Fasand May 10, 2021
bb82c05
update www bootstrap to 4.6, fix broken styles
Fasand May 10, 2021
eeb5c52
add seed URLs buttons to Topic Collection seeder and web (csv)
Fasand May 10, 2021
2637b3c
add collectstatic to prepare.sh
Fasand May 17, 2021
42ff335
fix visible-sm Bootstrap classes, remove clearfixes (don't do anythin…
Fasand May 17, 2021
46c8935
add active page underlining to Nominate
Fasand May 17, 2021
651b029
fix Select2 vs Bootstrap form-control styling – selected option overflow
Fasand May 17, 2021
1d774de
swap current/wayback URLs in WWW TopicCollection detail, swap normal/…
Fasand May 17, 2021
ee746f1
only show Django toolbar for fasand/petr
Fasand May 28, 2021
5af8d04
topic collections: add wayback_url to non-archived and custom seeds, …
Fasand May 28, 2021
e841ad7
add TODO for adding HarvestEditView harvest_type checking
Fasand May 17, 2021
7571c74
merge harvest-json with master
Fasand May 28, 2021
1e378ab
include custom seeds in the OneShot collection
Fasand May 28, 2021
1586637
remove tabs/newlines from custom_seeds when serving JSON, update TODO…
Fasand May 28, 2021
18e10ca
add combined aliases and annotations to Harvest JSON
Fasand May 28, 2021
c9819bf
change Harvest.scheduled_on to DateTimeField, use stock DateTimeInput…
Fasand Aug 3, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions Seeder/core/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from django.db import models
from django.utils import timezone
from django.forms.widgets import DateTimeInput

from core import widgets

Expand All @@ -14,6 +15,16 @@ def formfield(self, **kwargs):
return super().formfield(**defaults)


class DateTimePickerField(models.DateTimeField):
    """
    DateTimeField whose form field defaults to a ``DateTimeInput`` widget
    using the ``%d.%m.%Y %H:%M`` format.

    Any keyword argument passed to ``formfield`` (including ``widget``)
    overrides the default.
    """

    def formfield(self, **kwargs):
        # Later keys win, so caller-supplied kwargs replace the default widget
        options = {
            'widget': DateTimeInput(format="%d.%m.%Y %H:%M"),
            **kwargs,
        }
        return super().formfield(**options)


class BaseModel(models.Model):
active = models.BooleanField(default=True)
created = models.DateTimeField(default=timezone.now, editable=False)
Expand Down
10 changes: 10 additions & 0 deletions Seeder/core/widgets.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,16 @@ def __init__(self, attrs=None, format='%Y-%m-%d'):
super().__init__(attrs=attrs, format=format)


class DateTimePickerWidget(widgets.DateTimeInput):
    """
    ``DateTimeInput`` rendered as an HTML ``datetime-local`` input.

    attrs: optional dict of extra HTML attributes; its ``type`` entry, if
        any, is overridden with ``datetime-local``. The dict passed in is
        never modified.
    format: strftime format used for the value, defaults to the
        ``%Y-%m-%dT%H:%M`` form.
    """

    def __init__(self, attrs=None, format='%Y-%m-%dT%H:%M'):
        # Use the HTML datetime-local widget.
        # Build a new dict rather than calling attrs.update(): updating in
        # place would silently change a dict the caller may reuse elsewhere.
        attrs = {**(attrs or {}), 'type': 'datetime-local'}
        super().__init__(attrs=attrs, format=format)


class DateRangeWidget(widgets.MultiWidget):
def __init__(self, **kwargs):
super().__init__(
Expand Down
23 changes: 23 additions & 0 deletions Seeder/harvests/migrations/0015_auto_20210504_0731.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Generated by Django 2.2.20 on 2021-05-04 07:31

from django.db import migrations, models


class Migration(migrations.Migration):
    """
    Adds the ``aggregation_with_same_type`` and ``collection_alias``
    fields to the ``topiccollection`` model.
    """

    dependencies = [('harvests', '0014_harvest_seeds_not_harvested')]

    operations = [
        migrations.AddField(
            model_name='topiccollection',
            name='aggregation_with_same_type',
            field=models.BooleanField(
                default=True,
                verbose_name='Aggregation with same type',
            ),
        ),
        migrations.AddField(
            model_name='topiccollection',
            name='collection_alias',
            field=models.CharField(
                blank=True,
                max_length=64,
                verbose_name='Collection alias',
            ),
        ),
    ]
24 changes: 24 additions & 0 deletions Seeder/harvests/migrations/0016_auto_20210803_0835.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Generated by Django 2.2.20 on 2021-08-03 08:35

import core.models
from django.db import migrations


class Migration(migrations.Migration):
    """
    Switches ``scheduled_on`` on both ``harvest`` and ``topiccollection``
    to ``core.models.DateTimePickerField``.
    """

    dependencies = [('harvests', '0015_auto_20210504_0731')]

    operations = [
        migrations.AlterField(
            model_name='harvest',
            name='scheduled_on',
            field=core.models.DateTimePickerField(
                verbose_name='Date of harvest',
            ),
        ),
        migrations.AlterField(
            model_name='topiccollection',
            name='scheduled_on',
            field=core.models.DateTimePickerField(
                blank=True,
                null=True,
                verbose_name='Date of harvest',
            ),
        ),
    ]
185 changes: 175 additions & 10 deletions Seeder/harvests/models.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import os

from itertools import chain
from datetime import date
from django.contrib import messages
from hashlib import md5
from django.utils import timezone

from django.db import models
from django.utils.translation import ugettext_lazy as _
Expand All @@ -16,7 +16,7 @@
from ordered_model.models import OrderedModel

from blacklists.models import Blacklist
from core.models import BaseModel, DatePickerField
from core.models import BaseModel, DatePickerField, DateTimePickerField
from harvests.scheduler import get_dates_for_timedelta
from source import constants as source_constants
from source.models import Source, Seed, KeyWord
Expand Down Expand Up @@ -72,6 +72,35 @@ def repr(self):
def __str__(self):
# The string form is whatever self.repr() returns
return self.repr()

@staticmethod
def hash_seeds(seeds):
return md5("\n".join(seeds).encode("utf-8")).hexdigest()

def construct_collection_json(
        self, seeds, name, collectionAlias, annotation, nameCurator,
        idCollection, aggregationWithSameType, blacklisted=None):
    """
    Build the JSON-serialisable dict describing one collection of seeds.

    seeds: iterable of seed URLs (typically a set())
    blacklisted: iterable of URLs to leave out (typically a set()); when
        None, ``self.get_blacklisted()`` is used
    The remaining arguments are copied unchanged into the result under
    the keys of the same name.

    Returns None when no seeds remain after removing the blacklisted ones;
    otherwise a dict whose "seeds" is the sorted list of remaining URLs,
    with "seedsNo" as its length and "hash" from ``self.hash_seeds``.
    """
    if blacklisted is None:
        blacklisted = self.get_blacklisted()
    # Coerce both sides to sets so that lists or other iterables are
    # accepted as well; `list - set` would raise TypeError
    remaining = sorted(set(seeds) - set(blacklisted))
    # Collections shouldn't be empty
    if not remaining:
        return None
    return {
        "name": name,
        "collectionAlias": collectionAlias,
        "annotation": annotation,
        "nameCurator": nameCurator,
        "idCollection": idCollection,
        "aggregationWithSameType": aggregationWithSameType,
        "hash": self.hash_seeds(remaining),
        "seedsNo": len(remaining),
        "seeds": remaining,
    }

def pair_custom_seeds(self):
"""
Tries to pair the urls from ``custom_seeds`` with existing sources
Expand All @@ -94,7 +123,8 @@ def get_blacklisted(self):
def get_custom_seeds(self):
    """
    Return the set of URLs entered in ``custom_seeds``, one per line.

    Each line is stripped of surrounding whitespace and lines that end up
    empty are dropped. Returns an empty set when ``custom_seeds`` is
    empty or None.
    """
    if not self.custom_seeds:
        return set()
    # Unwanted tabs and newlines can appear when entering as text
    stripped = (line.strip() for line in self.custom_seeds.splitlines())
    return {url for url in stripped if url}

def get_custom_sources_seeds(self):
seeds = Seed.objects.filter(
Expand Down Expand Up @@ -158,7 +188,6 @@ class Harvest(HarvestAbstractModel):
)

# Only Harvests with these states will be checked in prev_harv_seeds
# TODO: make sure these harvest states make sense
PREVIOUSLY_HARVESTED_STATES = (
STATE_RUNNING, STATE_SUCCESS, STATE_SUCCESS_WITH_FAILURES
)
Expand Down Expand Up @@ -189,7 +218,7 @@ class Harvest(HarvestAbstractModel):

auto_created = models.BooleanField(default=False)

scheduled_on = DatePickerField(
scheduled_on = DateTimePickerField(
verbose_name=_('Date of harvest')
)

Expand Down Expand Up @@ -218,9 +247,10 @@ class Harvest(HarvestAbstractModel):

def get_topic_collections_by_frequency(self):
    """
    Return a queryset of the TopicCollections matched by any frequency in
    ``self.topic_collection_frequency``.

    When no frequencies are set (empty or None) the filter is applied
    with an empty pk list.
    """
    pks = []
    # `or ()` guards against topic_collection_frequency being None, which
    # cannot be iterated
    for freq in self.topic_collection_frequency or ():
        matching = TopicCollection.get_harvests_by_frequency(freq)
        pks.extend(matching.values_list('pk', flat=True))
    return TopicCollection.objects.filter(pk__in=pks)

def get_previously_harvested_seeds(self):
Expand All @@ -232,6 +262,120 @@ def get_previously_harvested_seeds(self):
seeds.update(h.get_seeds())
return seeds

def get_serials_frequency_json(self, frequency, blacklisted=None):
    """
    Build the collection dict for the seeds returned by
    ``Seed.objects.archiving()`` whose source has the given frequency.

    frequency: source frequency to select; 0 (OneShot) yields None
    blacklisted: passed through to ``construct_collection_json``

    Returns whatever ``construct_collection_json`` returns, or None for
    frequency 0.
    """
    # Disregard OneShot seeds, should be dealt with separately
    if frequency == 0:
        return None
    alias = f"M{frequency}"
    matching = Seed.objects.archiving().filter(source__frequency=frequency)
    urls = set(matching.values_list('url', flat=True))
    return self.construct_collection_json(
        urls,
        name=f"Serials_{alias}_{timezone.now():%Y-%m-%d}",
        collectionAlias=alias,
        annotation=f"Serials sklizeň s frekvencí {frequency}x ročně",
        nameCurator=None,
        idCollection=None,  # TODO: no real ID
        aggregationWithSameType=True,
        blacklisted=blacklisted,
    )

def get_json(self):
"""
Build the JSON-serialisable dict describing this Harvest.

One collection dict is gathered per selected topic collection, per
frequency-matched topic collection (skipping those whose idCollection
is already present), per ``target_frequency`` value, plus the
OneShot+custom, ArchiveIt and (when ``self.tests`` is set) Tests
collections. Collections that came back as None are dropped before the
combined seed list, hash, aliases and annotations are derived.

Several top-level values ("dateFrozen", "plannedStart", the date part
of "name", "type", "combined" and the limits) are still hard-coded
placeholders; see the TODOs next to them.
"""
# TODO: should figure out how to freeze/recognize correctly frozen
# TODO: could add json_frozen
# if self.seeds_frozen and self.seeds_frozen != '':
# return set(self.seeds_frozen.splitlines())

# Pre-compute blacklisted and pass down to all functions
blacklisted = self.get_blacklisted()

collections = []

# TODO: where should I check if there are topics+serials? – in Edit/Create Form, don't allow to create/change Harvest to something unsupported but if it already exists, it's fine

# Add selected topic collections
for tc in self.topic_collections.all():
collections.append(tc.get_collection_json(blacklisted))
# Add all topic collections by frequency
for tc in self.get_topic_collections_by_frequency():
# Ensure topic collection hasn't already been added
tc_json = tc.get_collection_json(blacklisted)
if tc_json and not any(
[tc_json["idCollection"] == c.get("idCollection")
# Collection can be None if it has no seeds
for c in collections if c is not None]
):
collections.append(tc_json)
# Add frequency serials, auto-ignores OneShots
if self.target_frequency:
for freq in self.target_frequency:
collections.append(
self.get_serials_frequency_json(freq, blacklisted))
# Pre-compute previously harvested seeds if OneShot or ArchiveIt
# NOTE(review): indentation is not preserved in this view, so it is
# unclear whether this `if` guards only the assignment below or the whole
# OneShot/ArchiveIt section. If it guards only the assignment,
# previously_harvested is unbound when neither flag is set - confirm
# against the real file.
if self.archive_it or self.is_oneshot:
previously_harvested = self.get_previously_harvested_seeds()
oneshot_seeds = self.get_oneshot_seeds(
blacklisted, previously_harvested)
custom_seeds = super(Harvest, self).get_seeds(blacklisted)
# OneShot collections contain OneShot and Custom sources/seeds
collections.append(self.construct_collection_json(
oneshot_seeds | custom_seeds, blacklisted=blacklisted,
name=f"Serials_OneShot_{timezone.now():%Y-%m-%d}",
collectionAlias="OneShot",
annotation="Serials sklizen pro OneShot+Custom seminka",
nameCurator=None,
idCollection=None,
aggregationWithSameType=True,
))
archiveit_seeds = self.get_archiveit_seeds(
blacklisted, previously_harvested)
collections.append(self.construct_collection_json(
archiveit_seeds, blacklisted=blacklisted,
name=f"Serials_ArchiveIt_{timezone.now():%Y-%m-%d}",
collectionAlias="ArchiveIt",
annotation="Vyber ArchiveIt seminek k archivaci",
nameCurator=None,
idCollection=None,
aggregationWithSameType=True,
))
if self.tests:
tests_seeds = self.get_tests_seeds(blacklisted)
collections.append(self.construct_collection_json(
tests_seeds, blacklisted=blacklisted,
name=f"Serials_Tests_{timezone.now():%Y-%m-%d}",
collectionAlias="Tests",
annotation="Vyber seminek na testovani",
nameCurator=None,
idCollection=None,
aggregationWithSameType=True,
))

# Filter out any potential None from collections
collections = [c for c in collections if c is not None]
# Get all seeds combined
# The per-collection seed lists are concatenated in collection order, so a
# URL present in several collections appears (and is counted) once per
# collection.
# NOTE(review): sum(lists, []) builds a new list at every step;
# itertools.chain.from_iterable would avoid the repeated copying.
seeds_combined = sum([c.get("seeds") for c in collections], [])
aliases = "-".join([c.get("collectionAlias") for c in collections])
annotations = " ~ ".join([c.get("annotation") for c in collections])

return {
"idHarvest": self.pk,
"dateGenerated": timezone.now().isoformat(),
"dateFrozen": "self.date_frozen.isoformat()", # TODO field
# TODO is this scheduled_on or a new field? scheduled is only date
"plannedStart": "self.(planned_start | scheduled_on).isoformat()",
"type": "serials", # TODO field
"combined": True, # TODO field or rule?
# TODO: can get super long if many topic collections / frequencies
# NOTE(review): "YYYY-MM-DD" is emitted literally here, unlike the
# collection names above which format timezone.now()
"name": f"Serials_YYYY-MM-DD_{aliases}",
# NOTE(review): key is spelled "anotation" (single "n") - confirm the
# consumer of this JSON expects that spelling
"anotation": annotations,
"hash": self.hash_seeds(seeds_combined),
"seedsNo": len(seeds_combined),
"duration": 259200, # TODO new field or model?
"budget": 10000, # TODO new field or model?
"dataLimit": 10000000000, # TODO new field or model?
"documentLimit": 0, # TODO new field or model?
"collections": collections,
}

def get_seeds_by_frequency(self, blacklisted=None):
if not self.target_frequency:
return set()
Expand Down Expand Up @@ -431,7 +575,7 @@ class TopicCollection(HarvestAbstractModel, OrderedModel):

auto_created = models.BooleanField(default=False)

scheduled_on = DatePickerField(
scheduled_on = DateTimePickerField(
verbose_name=_('Date of harvest'),
null=True, blank=True,
)
Expand All @@ -452,6 +596,12 @@ class TopicCollection(HarvestAbstractModel, OrderedModel):
date_from = DatePickerField(_('Date from'), null=True)
date_to = DatePickerField(_('Date to'), null=True, blank=True)

# Harvest-specific fields
collection_alias = models.CharField(
_("Collection alias"), max_length=64, blank=True)
aggregation_with_same_type = models.BooleanField(
_("Aggregation with same type"), default=True)

def get_www_url(self):
    """
    Return the URL reversed from ``www:collection_detail`` for this
    object's slug.
    """
    url_kwargs = {"slug": self.slug}
    return reverse('www:collection_detail', kwargs=url_kwargs)

Expand All @@ -468,6 +618,21 @@ def update_slug(self):
self.slug = unique_slug
self.save()

def get_collection_json(self, blacklisted=None):
    """
    Returns a dict() with topic collection details and seeds.

    blacklisted: passed through to ``construct_collection_json``

    The result is whatever ``construct_collection_json`` returns for
    ``self.get_seeds()``; "NoAlias" stands in for a missing alias.
    """
    # Truthiness covers both an empty string and None; len(None) would
    # raise TypeError
    alias = self.collection_alias or "NoAlias"
    return self.construct_collection_json(
        self.get_seeds(), blacklisted=blacklisted,
        # TODO: is this the date of creation or now?
        name=f"Topics_{alias}_{timezone.now():%Y-%m-%d}",
        collectionAlias=alias,
        annotation=self.annotation,
        nameCurator=self.title,
        idCollection=self.pk,
        aggregationWithSameType=self.aggregation_with_same_type,
    )

def __str__(self):
    """Return the title prefixed with a check mark (active) or a cross."""
    if self.active:
        sign = '✔'
    else:
        sign = '✗'
    return f'{sign} {self.title}'
Expand Down
1 change: 1 addition & 0 deletions Seeder/harvests/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def to_url(self, value):
path('<date:h_date>/harvests', ListHarvestUrls.as_view(),
name='harvest_urls'),
path('<int:pk>/urls', ListUrls.as_view(), name='urls'),
path('<int:pk>/json', JsonUrls.as_view(), name='json'),
# Harvest URLs based on type
path('<date:h_date>/shortcut_urls',
ListShortcutUrlsByDate.as_view(), name='shortcut_urls_by_date'),
Expand Down
Loading