[Data rearchitecture] Implement article status manager for timeslices #6083

Merged
10 changes: 10 additions & 0 deletions app/models/course.rb
@@ -332,6 +332,16 @@ def category_article_ids
categories.inject([]) { |ids, cat| ids + cat.article_ids }
end

# Retrieve articles based on the existing article course timeslices.
# This includes both tracked and untracked articles, such as those that
# don't belong to a tracked namespace.
def articles_from_timeslices(wiki_id)
Article.joins(:article_course_timeslices)
.where(article_course_timeslices: { course_id: id })
.where(wiki_id:)
.distinct
end

def update_until
self.end + UPDATE_LENGTH
end
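A minimal usage sketch for the new helper, assuming a persisted course whose article course timeslices already exist (the slug below is purely illustrative):

# Hypothetical course lookup; any persisted course with timeslices works
course = Course.find_by(slug: 'Example_University/Example_Course_(Fall)')
course.wikis.each do |wiki|
  # Joins on article_course_timeslices, so articles outside tracked namespaces are included too
  articles = course.articles_from_timeslices(wiki.id)
  Rails.logger.info "wiki #{wiki.id}: #{articles.count} distinct articles"
end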
1 change: 1 addition & 0 deletions app/models/wiki_content/article.rb
@@ -28,6 +28,7 @@ class Article < ApplicationRecord
has_many :revisions
has_many :editors, through: :revisions, source: :user
has_many :articles_courses, class_name: 'ArticlesCourses'
has_many :article_course_timeslices
has_many :courses, -> { distinct }, through: :articles_courses
has_many :assignments
belongs_to :wiki
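A small sketch of what the new association enables, assuming timeslice records already exist for the article and that a course is in scope (the mw_page_id is illustrative):

article = Article.find_by(mw_page_id: 12345, wiki_id: Wiki.default_wiki.id) # hypothetical page id
# Characters added on this article for a given course, summed across its timeslices
article.article_course_timeslices.where(course_id: course.id).sum(:character_sum)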
9 changes: 3 additions & 6 deletions app/services/update_course_stats_timeslice.rb
@@ -1,7 +1,7 @@
# frozen_string_literal: true

require_dependency "#{Rails.root}/lib/course_revision_updater"
require_dependency "#{Rails.root}/lib/article_status_manager"
require_dependency "#{Rails.root}/lib/article_status_manager_timeslice"
require_dependency "#{Rails.root}/lib/importers/course_upload_importer"
require_dependency "#{Rails.root}/lib/data_cycle/update_logger"
require_dependency "#{Rails.root}/lib/analytics/histogram_plotter"
@@ -28,8 +28,8 @@ def initialize(course)
@start_time = Time.zone.now
import_uploads
update_categories
@timeslice_errors = UpdateCourseWikiTimeslices.new(@course).run(all_time: @full_update)
update_article_status if should_update_article_status?
@timeslice_errors = UpdateCourseWikiTimeslices.new(@course).run(all_time: @full_update)
update_average_pageviews
update_caches
update_wikidata_stats if wikidata
@@ -55,11 +55,8 @@ def update_categories
end

def update_article_status
# TODO: note this is not wiki scoped.
ArticleStatusManager.update_article_status_for_course(@course)
ArticleStatusManagerTimeslice.update_article_status_for_course(@course)
@debugger.log_update_progress :article_status_updated

# TODO: replace the logic on ModifiedRevisionsManager.new(@wiki).move_or_delete_revisions
end

def update_average_pageviews
6 changes: 3 additions & 3 deletions app/services/update_course_wiki_timeslices.rb
@@ -129,10 +129,10 @@ def update_article_course_timeslices_for_wiki(revisions)
revs = revisions[:revisions]
revs.group_by(&:article_id).each do |article_id, article_revisions|
# We don't create articles courses for every article
article_course = ArticlesCourses.find_by(course: @course, article_id:)
next unless article_course
# article_course = ArticlesCourses.find_by(course: @course, article_id:)
# next unless article_course

# Update cache for ArticleCorseTimeslice
# Update cache for ArticleCourseTimeslice
article_revisions_data = { start: start_period, end: end_period,
revisions: article_revisions }
ArticleCourseTimeslice.update_article_course_timeslices(@course, article_id,
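For reference, a sketch of the per-article payload that update_article_course_timeslices_for_wiki now builds for every article with revisions in the period, whether or not an ArticlesCourses record exists (keys as they appear in the diff):

article_revisions_data = {
  start: start_period,          # beginning of the update period
  end: end_period,              # end of the update period
  revisions: article_revisions  # this article's revisions within the period
}
ArticleCourseTimeslice.update_article_course_timeslices(@course, article_id,
                                                        article_revisions_data)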
37 changes: 6 additions & 31 deletions app/services/update_timeslices_course_user.rb
@@ -37,26 +37,19 @@ def add_user_ids(user_ids)
return if user_ids.empty?

@course.wikis.each do |wiki|
fetch_users_revisions_for_wiki(wiki, user_ids, @course.start, @course.end)
fetch_users_revisions_for_wiki(wiki, user_ids)
end
end

def fetch_users_revisions_for_wiki(wiki, user_ids, first_start, latest_start)
current_start = first_start
def fetch_users_revisions_for_wiki(wiki, user_ids)
users = User.find(user_ids)

manager = RevisionDataManager.new(wiki, @course)
current_end = latest_start + @timeslice_manager.timeslice_duration(wiki)
# Fetch the revisions for users for the complete period
revisions = manager.fetch_revision_data_for_users(users,
current_start.strftime('%Y%m%d%H%M%S'),
current_end.strftime('%Y%m%d%H%M%S'))
wikis_and_starts = revisions_to_wiki_and_start_dates(revisions,
wiki,
first_start,
latest_start)

@timeslice_manager.update_course_wiki_timeslices_that_need_update(wikis_and_starts)
@course.start.strftime('%Y%m%d%H%M%S'),
@course.end.strftime('%Y%m%d%H%M%S'))
@timeslice_manager.update_timeslices_that_need_update_from_revisions(revisions, wiki)
end

def remove_courses_users(user_ids)
@@ -67,10 +60,9 @@ def remove_courses_users(user_ids)
# Do this to avoid running the query twice
@article_course_timeslices_for_users = get_article_course_timeslices_for_users(user_ids)
# Mark course wiki timeslices that need to be reprocessed
wikis_and_starts = @timeslice_manager.get_wiki_and_start_dates_to_reprocess(
@timeslice_manager.update_timeslices_that_need_update_from_article_timeslices(
@article_course_timeslices_for_users
)
@timeslice_manager.update_course_wiki_timeslices_that_need_update(wikis_and_starts)
# Clean articles courses timeslices
clean_article_course_timeslices
# Delete articles courses that were updated only for removed users
@@ -87,23 +79,6 @@ def get_article_course_timeslices_for_users(user_ids)
timeslices.flatten
end

def revisions_to_wiki_and_start_dates(revisions, wiki, first_start, latest_start)
tuples = []
current_start = first_start
while current_start <= latest_start
current_end = current_start + @timeslice_manager.timeslice_duration(wiki)
revisions_per_timeslice = revisions.select do |r|
current_start <= r.date && r.date < current_end
end
unless revisions_per_timeslice.empty?
tuples += [[wiki.id,
current_start.strftime('%Y%m%d%H%M%S')]]
end
current_start += @timeslice_manager.timeslice_duration(wiki)
end
tuples
end

def clean_article_course_timeslices
ids = @article_course_timeslices_for_users.map(&:id)
ArticleCourseTimeslice.where(id: ids).update_all(character_sum: 0, # rubocop:disable Rails/SkipsModelValidations
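The removed revisions_to_wiki_and_start_dates helper bucketed revisions into timeslice windows by hand; that bookkeeping now lives in the timeslice manager. A condensed sketch of the new flow for added users, using only calls that appear in the diff:

users = User.find(user_ids)
manager = RevisionDataManager.new(wiki, @course)
# One fetch bounded by the whole course period, formatted as MediaWiki-style timestamps
revisions = manager.fetch_revision_data_for_users(users,
                                                  @course.start.strftime('%Y%m%d%H%M%S'),
                                                  @course.end.strftime('%Y%m%d%H%M%S'))
# The manager derives which course wiki timeslices need reprocessing
@timeslice_manager.update_timeslices_that_need_update_from_revisions(revisions, wiki)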
6 changes: 2 additions & 4 deletions app/services/update_timeslices_untracked_article.rb
@@ -29,8 +29,7 @@ def untrack(article_ids)
# Only reprocess those non-empty timeslices
non_empty = @article_course_timeslices_to_untrack.where.not(user_ids: nil)
# Mark course wiki timeslices that need to be reprocessed
wikis_and_starts = @timeslice_manager.get_wiki_and_start_dates_to_reprocess(non_empty)
@timeslice_manager.update_course_wiki_timeslices_that_need_update(wikis_and_starts)
@timeslice_manager.update_timeslices_that_need_update_from_article_timeslices(non_empty)
untrack_timeslices
end

@@ -53,8 +52,7 @@ def retrack(article_ids)
non_empty = @article_course_timeslices_to_retrack.where.not(user_ids: nil)

# Mark course wiki timeslices that need to be reprocessed
wikis_and_starts = @timeslice_manager.get_wiki_and_start_dates_to_reprocess(non_empty)
@timeslice_manager.update_course_wiki_timeslices_that_need_update(wikis_and_starts)
@timeslice_manager.update_timeslices_that_need_update_from_article_timeslices(non_empty)
retrack_timeslices
end

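untrack and retrack now share the same pattern for flagging timeslices; a condensed sketch, where timeslices_to_change stands in for either instance variable:

# Only timeslices with user activity trigger reprocessing
non_empty = timeslices_to_change.where.not(user_ids: nil)
@timeslice_manager.update_timeslices_that_need_update_from_article_timeslices(non_empty)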
229 changes: 229 additions & 0 deletions lib/article_status_manager_timeslice.rb
@@ -0,0 +1,229 @@
# frozen_string_literal: true

require_dependency "#{Rails.root}/lib/articles_courses_cleaner_timeslice"
require_dependency "#{Rails.root}/lib/assignment_updater"

#= Updates articles to reflect deletion and page moves on Wikipedia
# This class is responsible for two main things (we may separate it in the future).
# - Iterate over article course timeslices to sync articles. This updates title,
# namespace, deleted and mw_page_id fields.
# - Reset articles according to their new status:
# * Articles that were deleted or untracked. These are articles that were either
# deleted or moved to a namespace not traceable by the course. Such articles
# should be excluded from course statistics.
# * Articles that were restored or re-tracked: These are articles that were
# either undeleted or moved to a namespace relevant to the course. Such articles
# should be included in course statistics.
# TODO: this class can probably be made simpler

class ArticleStatusManagerTimeslice
def initialize(course, wiki = nil)
@course = course
@wiki = wiki || Wiki.default_wiki
end

################
# Entry points #
################

def self.update_article_status_for_course(course)
course.wikis.each do |wiki|
# Retrieve articles based on article course timeslices, so currently untracked articles are also included.
course.articles_from_timeslices(wiki.id)
# Update only those articles that were last updated more than 1 day ago
.where('articles.updated_at < ?', 1.day.ago)
.in_batches do |article_batch|
# Using in_batches so that updated_at for all articles in the batch can be
# updated in a single query; with find_in_batches, a separate query would be
# required to update each article
new(course, wiki).update_status(article_batch)
# rubocop:disable Rails/SkipsModelValidations
article_batch.touch_all(:updated_at)
# rubocop:enable Rails/SkipsModelValidations
end
end

ArticlesCoursesCleanerTimeslice.reset_articles_for_course(course)
end

####################
# Per-wiki methods #
####################

def update_status(articles)
# This is used for problem cases where articles can't be easily disambiguated
# because of duplicate records with the same mw_page_id. It's only used if
# `articles` is just one record.
@article = articles.first if articles.one?

identify_deleted_and_synced_page_ids(articles)

# First we find any pages that just moved, and update title and namespace.
update_title_and_namespace @synced_articles

# Now we check for pages that have changed mw_page_ids.
# This happens in situations such as history merges.
# If articles move in between title/namespace updates and mw_page_id updates,
# then it's possible to end up with inconsistent data.
update_article_ids @deleted_page_ids

# Mark articles as deleted as appropriate
update_deleted_articles(articles)
move_timeslices_for_deleted_articles(articles)
end

##################
# Helper methods #
##################
private

def identify_deleted_and_synced_page_ids(articles)
@synced_articles = article_data_from_replica(articles)
@synced_ids = @synced_articles.map { |a| a['page_id'].to_i }

# If any Replica requests failed, we don't want to assume that missing
# articles are deleted.
# FIXME: A better approach would be to look for deletion logs, and only mark
# an article as deleted if there is a corresponding deletion log.
@deleted_page_ids = if @failed_request_count.zero?
articles.map(&:mw_page_id) - @synced_ids
else
[]
end
end

def article_data_from_replica(articles)
@failed_request_count = 0
synced_articles = Utils.chunk_requests(articles, 100) do |block|
request_results = Replica.new(@wiki).get_existing_articles_by_id block
@failed_request_count += 1 if request_results.nil?
request_results
end
synced_articles
end

def update_title_and_namespace(synced_articles)
# Update titles and namespaces based on mw_page_ids
synced_articles.each do |article_data|
article = @article || find_article_by_mw_page_id(article_data['page_id'])
next if data_matches_article?(article_data, article)

# FIXME: Workaround for four-byte unicode characters in article titles,
# until we fix the database to handle them.
# https://github.com/WikiEducationFoundation/WikiEduDashboard/issues/1744
# These titles are saved as their URL-encoded equivalents.
next if article.title[0] == '%'

begin
article.update!(title: article_data['page_title'],
namespace: article_data['page_namespace'],
deleted: false)
# Find corresponding Assignment records and update the titles
AssignmentUpdater.update_assignments_for_article(article)
rescue ActiveRecord::RecordNotUnique => e
# If we reach this point, it's most likely that @article has been set. This only
# happens when update_status is invoked with a single article, which probably indicates
# it was called from a cleanup script. In this case, we consider @article as
# a duplicate article record, so we re-process timeslices for it.

# If this is a duplicate article record, moving the revisions to the non-deleted
# copy should prevent it from being part of a future update.
# NOTE: ActiveRecord::RecordNotUnique is a subtype of ActiveRecord::StatementInvalid
# so this rescue comes first.
handle_undeletion(article)
Sentry.capture_exception e, level: 'warning'
rescue ActiveRecord::StatementInvalid => e # workaround for 4-byte unicode errors
Sentry.capture_exception e
end
end
end

def update_deleted_articles(articles)
return unless @failed_request_count.zero?
articles.each do |article|
next unless @deleted_page_ids.include? article.mw_page_id
# Reload to account for articles that have had their mw_page_id changed
# because the page was moved rather than deleted.
next unless @deleted_page_ids.include? article.reload.mw_page_id
article.update(deleted: true)
end
end

# If an article marked as deleted has a synced mw_page_id, then mark its timeslices
# as needs_update
def move_timeslices_for_deleted_articles(articles)
articles.filter(&:deleted).each do |article|
next unless @synced_ids.include? article.mw_page_id
handle_undeletion(article)
end
end

def handle_undeletion(article)
# If there's already a non-deleted copy, we need to reprocess the timeslices for the article
nondeleted_article = Article.find_by(wiki_id: @wiki.id,
mw_page_id: article.mw_page_id, deleted: false)
# If there is only one copy of the article, it was already found and updated
# via `update_title_and_namespace`
return unless nondeleted_article
ArticlesCoursesCleanerTimeslice.reset_specific_articles(@course, [article])
end

def data_matches_article?(article_data, article)
return false unless article.title == article_data['page_title']
return false unless article.namespace == article_data['page_namespace'].to_i
# If article data is collected from Replica, the article is not currently deleted
return false if article.deleted
true
end

# Check whether any deleted pages still exist with a different article_id.
# If so, update the Article to use the new id.
def update_article_ids(deleted_page_ids)
maybe_deleted = Article.where(mw_page_id: deleted_page_ids, wiki_id: @wiki.id)
return if maybe_deleted.empty?
# These pages have titles that match Articles in our DB with deleted ids
request_results = Replica.new(@wiki).post_existing_articles_by_title maybe_deleted
@failed_request_count += 1 if request_results.nil?

# Update articles whose IDs have changed (keyed on title and namespace)
request_results&.each do |stp|
resolve_page_id(stp, deleted_page_ids)
end
end

def resolve_page_id(same_title_page, deleted_page_ids)
title = same_title_page['page_title']
mw_page_id = same_title_page['page_id']
namespace = same_title_page['page_namespace']

article = Article.find_by(wiki_id: @wiki.id, title:, namespace:, deleted: false)

return unless article_data_matches?(article, title, deleted_page_ids)
update_article_page_id(article, mw_page_id)
end

def article_data_matches?(article, title, deleted_page_ids)
return false if article.nil?
return false unless deleted_page_ids.include?(article.mw_page_id)
# This catches false positives when the query for page_title matches
# a case variant.
return false unless article.title == title
true
end

def update_article_page_id(article, mw_page_id)
if Article.exists?(mw_page_id:, wiki_id: @wiki.id)
# Catches case where update_constantly has
# already added this article under a new ID
article.update(deleted: true)
else
article.update(mw_page_id:)
end
end

def find_article_by_mw_page_id(mw_page_id)
article = Article.find_by(wiki_id: @wiki.id, mw_page_id:, deleted: false)
article ||= Article.find_by(wiki_id: @wiki.id, mw_page_id:)
article
end
end
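A minimal usage sketch for the new manager, mirroring how UpdateCourseStatsTimeslice invokes it above; the single-wiki variant is an assumption based on the constructor and update_status signatures:

# Course-wide entry point: iterates the course wikis and syncs articles in batches
ArticleStatusManagerTimeslice.update_article_status_for_course(course)

# Scoped to one wiki, for a specific batch of articles
manager = ArticleStatusManagerTimeslice.new(course, wiki)
manager.update_status(course.articles_from_timeslices(wiki.id).limit(100))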