[Data rearchitecture] Implement article status manager for timeslices #6083

Merged
10 changes: 10 additions & 0 deletions app/models/course.rb
@@ -332,6 +332,16 @@ def category_article_ids
categories.inject([]) { |ids, cat| ids + cat.article_ids }
end

# Retrieve articles based on the existing article course timeslices.
# This includes both tracked and untracked articles, such as those that
# don't belong to a tracked namespace.
def articles_from_timeslices(wiki_id)
Article.joins(:article_course_timeslices)
.where(article_course_timeslices: { course_id: id })
.where(wiki_id:)
.distinct
end

def update_until
self.end + UPDATE_LENGTH
end
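A minimal usage sketch for the new helper, assuming a persisted course whose article course timeslices already exist (the slug below is purely illustrative):

# Hypothetical course lookup; any persisted course with timeslices works
course = Course.find_by(slug: 'Example_University/Example_Course_(Fall)')
course.wikis.each do |wiki|
  # Joins on article_course_timeslices, so articles outside tracked namespaces are included too
  articles = course.articles_from_timeslices(wiki.id)
  Rails.logger.info "wiki #{wiki.id}: #{articles.count} distinct articles"
end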
1 change: 1 addition & 0 deletions app/models/wiki_content/article.rb
@@ -28,6 +28,7 @@ class Article < ApplicationRecord
has_many :revisions
has_many :editors, through: :revisions, source: :user
has_many :articles_courses, class_name: 'ArticlesCourses'
has_many :article_course_timeslices
has_many :courses, -> { distinct }, through: :articles_courses
has_many :assignments
belongs_to :wiki
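A small sketch of what the new association enables, assuming timeslice records already exist for the article and that a course is in scope (the mw_page_id is illustrative):

article = Article.find_by(mw_page_id: 12345, wiki_id: Wiki.default_wiki.id) # hypothetical page id
# Characters added on this article for a given course, summed across its timeslices
article.article_course_timeslices.where(course_id: course.id).sum(:character_sum)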
9 changes: 3 additions & 6 deletions app/services/update_course_stats_timeslice.rb
@@ -1,7 +1,7 @@
# frozen_string_literal: true

require_dependency "#{Rails.root}/lib/course_revision_updater"
require_dependency "#{Rails.root}/lib/article_status_manager"
require_dependency "#{Rails.root}/lib/article_status_manager_timeslice"
require_dependency "#{Rails.root}/lib/importers/course_upload_importer"
require_dependency "#{Rails.root}/lib/data_cycle/update_logger"
require_dependency "#{Rails.root}/lib/analytics/histogram_plotter"
@@ -28,8 +28,8 @@ def initialize(course)
@start_time = Time.zone.now
import_uploads
update_categories
@timeslice_errors = UpdateCourseWikiTimeslices.new(@course).run(all_time: @full_update)
update_article_status if should_update_article_status?
@timeslice_errors = UpdateCourseWikiTimeslices.new(@course).run(all_time: @full_update)
update_average_pageviews
update_caches
update_wikidata_stats if wikidata
@@ -55,11 +55,8 @@ def update_categories
end

def update_article_status
# TODO: note this is not wiki scoped.
ArticleStatusManager.update_article_status_for_course(@course)
ArticleStatusManagerTimeslice.update_article_status_for_course(@course)
@debugger.log_update_progress :article_status_updated

# TODO: replace the logic on ModifiedRevisionsManager.new(@wiki).move_or_delete_revisions
end

def update_average_pageviews
6 changes: 3 additions & 3 deletions app/services/update_course_wiki_timeslices.rb
@@ -129,10 +129,10 @@ def update_article_course_timeslices_for_wiki(revisions)
revs = revisions[:revisions]
revs.group_by(&:article_id).each do |article_id, article_revisions|
# We don't create articles courses for every article
article_course = ArticlesCourses.find_by(course: @course, article_id:)
next unless article_course
# article_course = ArticlesCourses.find_by(course: @course, article_id:)
# next unless article_course

# Update cache for ArticleCorseTimeslice
# Update cache for ArticleCourseTimeslice
article_revisions_data = { start: start_period, end: end_period,
revisions: article_revisions }
ArticleCourseTimeslice.update_article_course_timeslices(@course, article_id,
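For reference, a sketch of the per-article payload that update_article_course_timeslices_for_wiki now builds for every article with revisions in the period, whether or not an ArticlesCourses record exists (keys as they appear in the diff):

article_revisions_data = {
  start: start_period,          # beginning of the update period
  end: end_period,              # end of the update period
  revisions: article_revisions  # this article's revisions within the period
}
ArticleCourseTimeslice.update_article_course_timeslices(@course, article_id,
                                                        article_revisions_data)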
37 changes: 6 additions & 31 deletions app/services/update_timeslices_course_user.rb
@@ -37,26 +37,19 @@ def add_user_ids(user_ids)
return if user_ids.empty?

@course.wikis.each do |wiki|
fetch_users_revisions_for_wiki(wiki, user_ids, @course.start, @course.end)
fetch_users_revisions_for_wiki(wiki, user_ids)
end
end

def fetch_users_revisions_for_wiki(wiki, user_ids, first_start, latest_start)
current_start = first_start
def fetch_users_revisions_for_wiki(wiki, user_ids)
users = User.find(user_ids)

manager = RevisionDataManager.new(wiki, @course)
current_end = latest_start + @timeslice_manager.timeslice_duration(wiki)
# Fetch the revisions for users for the complete period
revisions = manager.fetch_revision_data_for_users(users,
current_start.strftime('%Y%m%d%H%M%S'),
current_end.strftime('%Y%m%d%H%M%S'))
wikis_and_starts = revisions_to_wiki_and_start_dates(revisions,
wiki,
first_start,
latest_start)

@timeslice_manager.update_course_wiki_timeslices_that_need_update(wikis_and_starts)
@course.start.strftime('%Y%m%d%H%M%S'),
@course.end.strftime('%Y%m%d%H%M%S'))
@timeslice_manager.update_timeslices_that_need_update_from_revisions(revisions, wiki)
end

def remove_courses_users(user_ids)
@@ -67,10 +60,9 @@ def remove_courses_users(user_ids)
# Do this to avoid running the query twice
@article_course_timeslices_for_users = get_article_course_timeslices_for_users(user_ids)
# Mark course wiki timeslices that need to be reprocessed
wikis_and_starts = @timeslice_manager.get_wiki_and_start_dates_to_reprocess(
@timeslice_manager.update_timeslices_that_need_update_from_article_timeslices(
@article_course_timeslices_for_users
)
@timeslice_manager.update_course_wiki_timeslices_that_need_update(wikis_and_starts)
# Clean articles courses timeslices
clean_article_course_timeslices
# Delete articles courses that were updated only for removed users
@@ -87,23 +79,6 @@ def get_article_course_timeslices_for_users(user_ids)
timeslices.flatten
end

def revisions_to_wiki_and_start_dates(revisions, wiki, first_start, latest_start)
tuples = []
current_start = first_start
while current_start <= latest_start
current_end = current_start + @timeslice_manager.timeslice_duration(wiki)
revisions_per_timeslice = revisions.select do |r|
current_start <= r.date && r.date < current_end
end
unless revisions_per_timeslice.empty?
tuples += [[wiki.id,
current_start.strftime('%Y%m%d%H%M%S')]]
end
current_start += @timeslice_manager.timeslice_duration(wiki)
end
tuples
end

def clean_article_course_timeslices
ids = @article_course_timeslices_for_users.map(&:id)
ArticleCourseTimeslice.where(id: ids).update_all(character_sum: 0, # rubocop:disable Rails/SkipsModelValidations
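The removed revisions_to_wiki_and_start_dates helper bucketed revisions into timeslice windows by hand; that bookkeeping now lives in the timeslice manager. A condensed sketch of the new flow for added users, using only calls that appear in the diff:

users = User.find(user_ids)
manager = RevisionDataManager.new(wiki, @course)
# One fetch bounded by the whole course period, formatted as MediaWiki-style timestamps
revisions = manager.fetch_revision_data_for_users(users,
                                                  @course.start.strftime('%Y%m%d%H%M%S'),
                                                  @course.end.strftime('%Y%m%d%H%M%S'))
# The manager derives which course wiki timeslices need reprocessing
@timeslice_manager.update_timeslices_that_need_update_from_revisions(revisions, wiki)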
6 changes: 2 additions & 4 deletions app/services/update_timeslices_untracked_article.rb
@@ -29,8 +29,7 @@ def untrack(article_ids)
# Only reprocess those non-empty timeslices
non_empty = @article_course_timeslices_to_untrack.where.not(user_ids: nil)
# Mark course wiki timeslices that need to be reprocessed
wikis_and_starts = @timeslice_manager.get_wiki_and_start_dates_to_reprocess(non_empty)
@timeslice_manager.update_course_wiki_timeslices_that_need_update(wikis_and_starts)
@timeslice_manager.update_timeslices_that_need_update_from_article_timeslices(non_empty)
untrack_timeslices
end

@@ -53,8 +52,7 @@ def retrack(article_ids)
non_empty = @article_course_timeslices_to_retrack.where.not(user_ids: nil)

# Mark course wiki timeslices that need to be reprocessed
wikis_and_starts = @timeslice_manager.get_wiki_and_start_dates_to_reprocess(non_empty)
@timeslice_manager.update_course_wiki_timeslices_that_need_update(wikis_and_starts)
@timeslice_manager.update_timeslices_that_need_update_from_article_timeslices(non_empty)
retrack_timeslices
end

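untrack and retrack now share the same pattern for flagging timeslices; a condensed sketch, where timeslices_to_change stands in for either instance variable:

# Only timeslices with user activity trigger reprocessing
non_empty = timeslices_to_change.where.not(user_ids: nil)
@timeslice_manager.update_timeslices_that_need_update_from_article_timeslices(non_empty)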
229 changes: 229 additions & 0 deletions lib/article_status_manager_timeslice.rb
@@ -0,0 +1,229 @@
# frozen_string_literal: true

require_dependency "#{Rails.root}/lib/articles_courses_cleaner_timeslice"
require_dependency "#{Rails.root}/lib/assignment_updater"

#= Updates articles to reflect deletion and page moves on Wikipedia
# This class is responsible for two main things (we may separate it in the future).
# - Iterate over article course timeslices to sync articles. This updates title,
# namespace, deleted and mw_page_id fields.
# - Reset articles according to their new status:
# * Articles that were deleted or untracked. These are articles that were either
# deleted or moved to a namespace not traceable by the course. Such articles
# should be excluded from course statistics.
# * Articles that were restored or re-tracked: These are articles that were
# either undeleted or moved to a namespace relevant to the course. Such articles
# should be included in course statistics.
# TODO: this class can probably be made simpler

class ArticleStatusManagerTimeslice
def initialize(course, wiki = nil)
@course = course
@wiki = wiki || Wiki.default_wiki
end

################
# Entry points #
################

def self.update_article_status_for_course(course)
course.wikis.each do |wiki|
# Retrieve articles based on article course timeslices, so currently untracked articles are also included.
course.articles_from_timeslices(wiki.id)
# Update only those articles that were last updated more than 1 day ago
.where('articles.updated_at < ?', 1.day.ago)
.in_batches do |article_batch|
# Using in_batches so that updated_at for all articles in the batch can be
# updated in a single query; with find_in_batches, a separate query would be
# required to update each article
new(course, wiki).update_status(article_batch)
# rubocop:disable Rails/SkipsModelValidations
article_batch.touch_all(:updated_at)
# rubocop:enable Rails/SkipsModelValidations
end
end

ArticlesCoursesCleanerTimeslice.reset_articles_for_course(course)
end

####################
# Per-wiki methods #
####################

def update_status(articles)
# This is used for problem cases where articles can't be easily disambiguated
# because of duplicate records with the same mw_page_id. It's only used if
# `articles` is just one record.
@article = articles.first if articles.one?

identify_deleted_and_synced_page_ids(articles)

# First we find any pages that just moved, and update title and namespace.
update_title_and_namespace @synced_articles

# Now we check for pages that have changed mw_page_ids.
# This happens in situations such as history merges.
# If articles move in between title/namespace updates and mw_page_id updates,
# then it's possible to end up with inconsistent data.
update_article_ids @deleted_page_ids

# Mark articles as deleted as appropriate
update_deleted_articles(articles)
move_timeslices_for_deleted_articles(articles)
end

##################
# Helper methods #
##################
private

def identify_deleted_and_synced_page_ids(articles)
@synced_articles = article_data_from_replica(articles)
@synced_ids = @synced_articles.map { |a| a['page_id'].to_i }

# If any Replica requests failed, we don't want to assume that missing
# articles are deleted.
# FIXME: A better approach would be to look for deletion logs, and only mark
# an article as deleted if there is a corresponding deletion log.
@deleted_page_ids = if @failed_request_count.zero?
articles.map(&:mw_page_id) - @synced_ids
else
[]
end
end

def article_data_from_replica(articles)
@failed_request_count = 0
synced_articles = Utils.chunk_requests(articles, 100) do |block|
request_results = Replica.new(@wiki).get_existing_articles_by_id block
@failed_request_count += 1 if request_results.nil?
request_results
end
synced_articles
end

def update_title_and_namespace(synced_articles)
# Update titles and namespaces based on mw_page_ids
synced_articles.each do |article_data|
article = @article || find_article_by_mw_page_id(article_data['page_id'])
next if data_matches_article?(article_data, article)

# FIXME: Workaround for four-byte unicode characters in article titles,
# until we fix the database to handle them.
# https://github.com/WikiEducationFoundation/WikiEduDashboard/issues/1744
# These titles are saved as their URL-encoded equivalents.
next if article.title[0] == '%'

begin
article.update!(title: article_data['page_title'],
namespace: article_data['page_namespace'],
deleted: false)
# Find corresponding Assignment records and update the titles
AssignmentUpdater.update_assignments_for_article(article)
rescue ActiveRecord::RecordNotUnique => e
# If we reach this point, it's most likely that @article has been set. This only
# happens when update_status is invoked with a single article, which probably indicates
# it was called from a cleanup script. In this case, we consider @article as
# a duplicate article record, so we re-process timeslices for it.

# If this is a duplicate article record, moving the revisions to the non-deleted
# copy should prevent it from being part of a future update.
# NOTE: ActiveRecord::RecordNotUnique is a subtype of ActiveRecord::StatementInvalid
# so this rescue comes first.
handle_undeletion(article)
Sentry.capture_exception e, level: 'warning'
rescue ActiveRecord::StatementInvalid => e # workaround for 4-byte unicode errors
Sentry.capture_exception e
end
end
end

def update_deleted_articles(articles)
return unless @failed_request_count.zero?
articles.each do |article|
next unless @deleted_page_ids.include? article.mw_page_id
# Reload to account for articles that have had their mw_page_id changed
# because the page was moved rather than deleted.
next unless @deleted_page_ids.include? article.reload.mw_page_id
article.update(deleted: true)
end
end

# If an article marked as deleted has a synced mw_page_id, then mark its timeslices
# as needs_update
def move_timeslices_for_deleted_articles(articles)
articles.filter(&:deleted).each do |article|
next unless @synced_ids.include? article.mw_page_id
handle_undeletion(article)
end
end

def handle_undeletion(article)
# If there's already a non-deleted copy, we need to reprocess the timeslices for the article
nondeleted_article = Article.find_by(wiki_id: @wiki.id,
mw_page_id: article.mw_page_id, deleted: false)
# If there is only one copy of the article, it was already found and updated
# via `update_title_and_namespace`
return unless nondeleted_article
ArticlesCoursesCleanerTimeslice.reset_specific_articles(@course, [article])
end

def data_matches_article?(article_data, article)
return false unless article.title == article_data['page_title']
return false unless article.namespace == article_data['page_namespace'].to_i
# If article data is collected from Replica, the article is not currently deleted
return false if article.deleted
true
end

# Check whether any deleted pages still exist with a different article_id.
# If so, update the Article to use the new id.
def update_article_ids(deleted_page_ids)
maybe_deleted = Article.where(mw_page_id: deleted_page_ids, wiki_id: @wiki.id)
return if maybe_deleted.empty?
# These pages have titles that match Articles in our DB with deleted ids
request_results = Replica.new(@wiki).post_existing_articles_by_title maybe_deleted
@failed_request_count += 1 if request_results.nil?

# Update articles whose IDs have changed (keyed on title and namespace)
request_results&.each do |stp|
resolve_page_id(stp, deleted_page_ids)
end
end

def resolve_page_id(same_title_page, deleted_page_ids)
title = same_title_page['page_title']
mw_page_id = same_title_page['page_id']
namespace = same_title_page['page_namespace']

article = Article.find_by(wiki_id: @wiki.id, title:, namespace:, deleted: false)

return unless article_data_matches?(article, title, deleted_page_ids)
update_article_page_id(article, mw_page_id)
end

def article_data_matches?(article, title, deleted_page_ids)
return false if article.nil?
return false unless deleted_page_ids.include?(article.mw_page_id)
# This catches false positives when the query for page_title matches
# a case variant.
return false unless article.title == title
true
end

def update_article_page_id(article, mw_page_id)
if Article.exists?(mw_page_id:, wiki_id: @wiki.id)
# Catches case where update_constantly has
# already added this article under a new ID
article.update(deleted: true)
else
article.update(mw_page_id:)
end
end

def find_article_by_mw_page_id(mw_page_id)
article = Article.find_by(wiki_id: @wiki.id, mw_page_id:, deleted: false)
article ||= Article.find_by(wiki_id: @wiki.id, mw_page_id:)
article
end
end
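A minimal usage sketch for the new manager, mirroring how UpdateCourseStatsTimeslice invokes it above; the single-wiki variant is an assumption based on the constructor and update_status signatures:

# Course-wide entry point: iterates the course wikis and syncs articles in batches
ArticleStatusManagerTimeslice.update_article_status_for_course(course)

# Scoped to one wiki, for a specific batch of articles
manager = ArticleStatusManagerTimeslice.new(course, wiki)
manager.update_status(course.articles_from_timeslices(wiki.id).limit(100))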