"""One-off diagnostic query for incorrectly indexed nov/dec 2021 stories.

POSTs a ``query_string`` search (plus a per-day ``publication_date``
histogram) to the ``mc_search-000004`` index and pretty-prints the JSON
response on stdout.  Exits with a non-zero status and the server's
response body on stderr if the search request fails.
"""
import json
import sys

import requests

# for incorrectly indexed nov/dec 2021 stories
#
# Alternative queries tried during the investigation are kept below, with the
# hit counts they produced.  They are written as raw strings because "\:" (the
# query-syntax escape for a literal colon) is not a valid Python escape.
#
# NOTE: will match documents w/ correct canonical_domain!
# total hits: 2543364 (before indexing any additional stories w/ correct can_dom)
#qqq = r'original_url:"http\://mediacloud.org/need_canonical_url"'

# total hits: 2543367
#qqq = 'canonical_domain:mediacloud.org'

#qqq = r'canonical_domain:mediacloud.org AND NOT original_url:"http\://mediacloud.org/need_canonical_url"'
# finds 3:
#  "original_url": "https://mediacloud.org/news/2017/11/30/how-the-indian-news-covered-the-2017-farmer-protests-a-quantitative-study",
#  "original_url": "https://core.mediacloud.org:443/urdunews/",
#  "original_url": "https://core.mediacloud.org:443/",

# total hits: 2543364 w/ indexed_date between 2024-10-25 and 2024-10-28
#qqq = r'canonical_domain:mediacloud.org AND original_url:"http\://mediacloud.org/need_canonical_url"'
#
# total hits: 2543364
qqq = 'canonical_domain:mediacloud.org AND indexed_date:[2024-10-25 TO 2024-10-28]'
#
# hits sliced by indexed_date (which is full datetime, so can be sliced by hour):
# 2024-10-25 819578
# 2024-10-26 618970
# 2024-10-27 1
# 2024-10-28 1104815

# all affected records in mc_search-000004 index
SEARCH_URL = "http://ramos.angwin:9200/mc_search-000004/_search"

# (connect, read) timeouts in seconds.  Without a timeout requests waits
# forever on an unreachable or stalled host; the read limit is generous
# because the aggregation runs over millions of documents.
REQUEST_TIMEOUT = (10, 300)

# Search request body.
q = {
    # Only return the fields needed to eyeball the affected documents.
    "_source": [
        "publication_date",
        "indexed_date",
        "canonical_domain",
        "url",
        "original_url",
    ],
    "query": {
        "query_string": {
            # default_field only applies to terms without an explicit field.
            "default_field": "text_content",
            "default_operator": "AND",
            "query": qqq,
        }
    },
    # Count matching documents per publication day, omitting empty days.
    "aggregations": {
        "dailycounts": {
            "date_histogram": {
                "field": "publication_date",
                "calendar_interval": "day",
                "min_doc_count": 1,
            }
        }
    },
    # Ask for the exact total hit count rather than a capped estimate.
    "track_total_hits": True,
}

resp = requests.post(SEARCH_URL, json=q, timeout=REQUEST_TIMEOUT)
if not resp.ok:
    # Surface the server's explanation instead of printing an error
    # document as though it were a search result.
    sys.exit(f"search failed: HTTP {resp.status_code}\n{resp.text}")

res = resp.json()
print(json.dumps(res, indent=1))