From 197be95f664d818524041b12a213a302cd05cfbb Mon Sep 17 00:00:00 2001 From: Andy Halterman Date: Tue, 24 Mar 2015 12:41:58 -0400 Subject: [PATCH 01/24] Add SUN, DDR, CSK to country list --- postprocess.py | 55 +++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/postprocess.py b/postprocess.py index 09d8943..54fefaa 100644 --- a/postprocess.py +++ b/postprocess.py @@ -272,33 +272,34 @@ def process_actors(event): one of GOV, MIL, REB, OPP, PTY, COP, JUD, SPY, MED, EDU, BUS, CRM, or CVL. The ``others`` contains all other actor or agent codes. """ - countries = ('ABW', 'AFG', 'AGO', 'AIA', 'ALA', 'ALB', 'AND', 'ARE', 'ARG', - 'ARM', 'ASM', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BES', 'BFA', - 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLM', 'BLR', 'BLZ', 'BMU', - 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN', 'CHE', - 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COK', 'COL', 'COM', - 'CPV', 'CRI', 'CUB', 'CUW', 'CYM', 'CYP', 'CZE', 'DEU', 'DJI', - 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESH', 'ESP', - 'EST', 'ETH', 'FIN', 'FJI', 'FLK', 'FRA', 'FRO', 'FSM', 'GAB', - 'GBR', 'GEO', 'GGY', 'GHA', 'GIB', 'GIN', 'GLP', 'GMB', 'GNB', - 'GNQ', 'GRC', 'GRD', 'GRL', 'GTM', 'GUF', 'GUM', 'GUY', 'HKG', - 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IMN', 'IND', 'IRL', 'IRN', - 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JEY', 'JOR', 'JPN', 'KAZ', - 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', - 'LBR', 'LBY', 'LCA', 'LIE', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', - 'MAC', 'MAF', 'MAR', 'MCO', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', - 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', 'MNP', 'MOZ', 'MRT', - 'MSR', 'MTQ', 'MUS', 'MWI', 'MYS', 'MYT', 'NAM', 'NCL', 'NER', - 'NFK', 'NGA', 'NIC', 'NIU', 'NLD', 'NOR', 'NPL', 'NRU', 'NZL', - 'OMN', 'PAK', 'PAN', 'PCN', 'PER', 'PHL', 'PLW', 'PNG', 'POL', - 'PRI', 'PRK', 'PRT', 'PRY', 'PSE', 'PYF', 'QAT', 'REU', 'ROU', - 'RUS', 'RWA', 'SAU', 'SCG', 'SDN', 'SEN', 'SGP', 'SHN', 'SJM', - 'SLB', 
'SLE', 'SLV', 'SMR', 'SOM', 'SPM', 'SRB', 'SSD', 'STP', - 'SUR', 'SVK', 'SVN', 'SWE', 'SWZ', 'SXM', 'SYC', 'SYR', 'TCA', - 'TCD', 'TGO', 'THA', 'TJK', 'TKL', 'TKM', 'TLS', 'TON', 'TTO', - 'TUN', 'TUR', 'TWN', 'TUV', 'TZA', 'UGA', 'UKR', 'URY', 'USA', - 'UZB', 'VAT', 'VCT', 'VEN', 'VGB', 'VIR', 'VNM', 'VUT', 'WLF', - 'WSM', 'YEM', 'ZAF', 'ZMB', 'ZWE', 'ATG', 'AUS') + countries = ('ABW', 'AFG', 'AGO', 'AIA', 'ALA', 'ALB', 'AND', 'ARE', + 'ARG', 'ARM', 'ASM', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BES', + 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLM', 'BLR', 'BLZ', + 'BMU', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN', + 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COK', 'COL', + 'COM', 'CPV', 'CRI', 'CSK', 'CUB', 'CUW', 'CYM', 'CYP', 'CZE', + 'DDR', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', + 'ERI', 'ESH', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FLK', 'FRA', + 'FRO', 'FSM', 'GAB', 'GBR', 'GEO', 'GGY', 'GHA', 'GIB', 'GIN', + 'GLP', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD', 'GRL', 'GTM', 'GUF', + 'GUM', 'GUY', 'HKG', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IMN', + 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JEY', + 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', + 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LCA', 'LIE', 'LKA', 'LSO', + 'LTU', 'LUX', 'LVA', 'MAC', 'MAF', 'MAR', 'MCO', 'MDA', 'MDG', + 'MDV', 'MEX', 'MHL', 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', + 'MNP', 'MOZ', 'MRT', 'MSR', 'MTQ', 'MUS', 'MWI', 'MYS', 'MYT', + 'NAM', 'NCL', 'NER', 'NFK', 'NGA', 'NIC', 'NIU', 'NLD', 'NOR', + 'NPL', 'NRU', 'NZL', 'OMN', 'PAK', 'PAN', 'PCN', 'PER', 'PHL', + 'PLW', 'PNG', 'POL', 'PRI', 'PRK', 'PRT', 'PRY', 'PSE', 'PYF', + 'QAT', 'REU', 'ROU', 'RUS', 'RWA', 'SAU', 'SCG', 'SDN', 'SEN', + 'SGP', 'SHN', 'SJM', 'SLB', 'SLE', 'SLV', 'SMR', 'SOM', 'SPM', + 'SRB', 'SSD', 'SUN', 'STP', 'SUR', 'SVK', 'SVN', 'SWE', 'SWZ', + 'SXM', 'SYC', 'SYR', 'TCA', 'TCD', 'TGO', 'THA', 'TJK', 'TKL', + 'TKM', 'TLS', 'TON', 'TTO', 'TUN', 'TUR', 'TWN', 'TUV', 'TZA', + 
'UGA', 'UKR', 'URY', 'USA', 'UZB', 'VAT', 'VCT', 'VEN', 'VGB', + 'VIR', 'VNM', 'VUT', 'WLF', 'WSM', 'YEM', 'ZAF', 'ZMB', 'ZWE', + 'ATG', 'AUS') root_actors = ('IGO', 'NGO', 'IMG', 'MNC') primary_agent = ('GOV', 'MIL', 'REB', 'OPP', 'PTY', 'COP', 'JUD', 'SPY', 'MED', 'EDU', 'BUS', 'CRM', 'CVL') From ecb3a96f6929b21ea73fcbf6d4c508e8b517a255 Mon Sep 17 00:00:00 2001 From: Andy Halterman Date: Tue, 24 Mar 2015 12:56:21 -0400 Subject: [PATCH 02/24] Add more country codes, KSV + historical --- postprocess.py | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/postprocess.py b/postprocess.py index 54fefaa..e0e3890 100644 --- a/postprocess.py +++ b/postprocess.py @@ -275,31 +275,31 @@ def process_actors(event): countries = ('ABW', 'AFG', 'AGO', 'AIA', 'ALA', 'ALB', 'AND', 'ARE', 'ARG', 'ARM', 'ASM', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BES', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLM', 'BLR', 'BLZ', - 'BMU', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN', - 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COK', 'COL', - 'COM', 'CPV', 'CRI', 'CSK', 'CUB', 'CUW', 'CYM', 'CYP', 'CZE', - 'DDR', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', - 'ERI', 'ESH', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FLK', 'FRA', - 'FRO', 'FSM', 'GAB', 'GBR', 'GEO', 'GGY', 'GHA', 'GIB', 'GIN', - 'GLP', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD', 'GRL', 'GTM', 'GUF', - 'GUM', 'GUY', 'HKG', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IMN', - 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JEY', - 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', - 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LCA', 'LIE', 'LKA', 'LSO', - 'LTU', 'LUX', 'LVA', 'MAC', 'MAF', 'MAR', 'MCO', 'MDA', 'MDG', - 'MDV', 'MEX', 'MHL', 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', - 'MNP', 'MOZ', 'MRT', 'MSR', 'MTQ', 'MUS', 'MWI', 'MYS', 'MYT', - 'NAM', 'NCL', 'NER', 'NFK', 'NGA', 'NIC', 'NIU', 'NLD', 'NOR', - 'NPL', 'NRU', 'NZL', 'OMN', 'PAK', 'PAN', 
'PCN', 'PER', 'PHL', - 'PLW', 'PNG', 'POL', 'PRI', 'PRK', 'PRT', 'PRY', 'PSE', 'PYF', - 'QAT', 'REU', 'ROU', 'RUS', 'RWA', 'SAU', 'SCG', 'SDN', 'SEN', - 'SGP', 'SHN', 'SJM', 'SLB', 'SLE', 'SLV', 'SMR', 'SOM', 'SPM', - 'SRB', 'SSD', 'SUN', 'STP', 'SUR', 'SVK', 'SVN', 'SWE', 'SWZ', - 'SXM', 'SYC', 'SYR', 'TCA', 'TCD', 'TGO', 'THA', 'TJK', 'TKL', - 'TKM', 'TLS', 'TON', 'TTO', 'TUN', 'TUR', 'TWN', 'TUV', 'TZA', - 'UGA', 'UKR', 'URY', 'USA', 'UZB', 'VAT', 'VCT', 'VEN', 'VGB', - 'VIR', 'VNM', 'VUT', 'WLF', 'WSM', 'YEM', 'ZAF', 'ZMB', 'ZWE', - 'ATG', 'AUS') + 'BMU', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BYS', 'BWA', 'CAF', + 'CAN', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COK', + 'COL', 'COM', 'CPV', 'CRI', 'CSK', 'CUB', 'CUW', 'CYM', 'CYP', + 'CZE', 'DDR', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', + 'EGY', 'ERI', 'ESH', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FLK', + 'FRA', 'FRO', 'FSM', 'GAB', 'GBR', 'GEO', 'GGY', 'GHA', 'GIB', + 'GIN', 'GLP', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD', 'GRL', 'GTM', + 'GUF', 'GUM', 'GUY', 'HKG', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', + 'IMN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', + 'JEY', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', + 'KOR', 'KSV', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LCA', 'LIE', + 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAC', 'MAF', 'MAR', 'MCO', + 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', 'MKD', 'MLI', 'MLT', 'MMR', + 'MNE', 'MNG', 'MNP', 'MOZ', 'MRT', 'MSR', 'MTQ', 'MUS', 'MWI', + 'MYS', 'MYT', 'NAM', 'NCL', 'NER', 'NFK', 'NGA', 'NIC', 'NIU', + 'NLD', 'NOR', 'NPL', 'NRU', 'NZL', 'OMN', 'PAK', 'PAN', 'PCN', + 'PER', 'PHL', 'PLW', 'PNG', 'POL', 'PRI', 'PRK', 'PRT', 'PRY', + 'PSE', 'PYF', 'QAT', 'REU', 'ROU', 'RUS', 'RWA', 'SAU', 'SCG', + 'SDN', 'SEN', 'SGP', 'SHN', 'SJM', 'SLB', 'SLE', 'SLV', 'SMR', + 'SOM', 'SPM', 'SRB', 'SSD', 'SUN', 'STP', 'SUR', 'SVK', 'SVN', + 'SWE', 'SWZ', 'SXM', 'SYC', 'SYR', 'TCA', 'TCD', 'TGO', 'THA', + 'TJK', 'TMP', 'TKL', 'TKM', 'TLS', 'TON', 'TTO', 'TUN', 'TUR', + 'TWN', 
'TUV', 'TZA', 'UGA', 'UKR', 'URY', 'USA', 'UZB', 'VAT', + 'VCT', 'VEN', 'VGB', 'VIR', 'VNM', 'VUT', 'WLF', 'WSM', 'YEM', + 'YMD', 'YUG', 'ZAF', 'ZAR', 'ZMB', 'ZWE', 'ATG', 'AUS') root_actors = ('IGO', 'NGO', 'IMG', 'MNC') primary_agent = ('GOV', 'MIL', 'REB', 'OPP', 'PTY', 'COP', 'JUD', 'SPY', 'MED', 'EDU', 'BUS', 'CRM', 'CVL') From d42cdf34556c7de293f15104ced6ac91c72f4188 Mon Sep 17 00:00:00 2001 From: Parham Negahdar Date: Thu, 30 Apr 2015 13:26:09 -0400 Subject: [PATCH 03/24] Minor Fixes: Add db host, editable petarch --- PHOX_config.ini | 1 + requirements.txt | 2 +- scraper_connection.py | 2 +- utilities.py | 17 +++++++++++++---- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/PHOX_config.ini b/PHOX_config.ini index 1d7fd0c..435b006 100644 --- a/PHOX_config.ini +++ b/PHOX_config.ini @@ -24,3 +24,4 @@ oneaday_filter = True #auth_db = db_name #auth_user = username #auth_pass = password +#db_host = 127.0.0.1 diff --git a/requirements.txt b/requirements.txt index 8afc297..4a2394b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ Unidecode==0.04.16 corenlp-python==3.2.0-3 docutils==0.11 numpydoc==0.4 -petrarch==0.01a +-e git+https://github.com/openeventdata/petrarch.git@5fe92b676e8b4fb9a1964e7da55e9c95e9a5745f#egg=petrarch-master pexpect==3.2 pymongo==2.7 python-dateutil==2.2 diff --git a/scraper_connection.py b/scraper_connection.py index 331fa0d..f599e04 100644 --- a/scraper_connection.py +++ b/scraper_connection.py @@ -146,7 +146,7 @@ def main(current_date, file_details, write_file=False, file_stem=None): """ sources = _get_sources('source_keys.txt') conn = utilities.make_conn(file_details.auth_db, file_details.auth_user, - file_details.auth_pass) + file_details.auth_pass, file_details.db_host) less_than = datetime.datetime(current_date.year, current_date.month, current_date.day) diff --git a/utilities.py b/utilities.py index f9b5c5e..ec1f54f 100644 --- a/utilities.py +++ b/utilities.py @@ -2,7 +2,9 @@ from __future__ import 
unicode_literals import re import logging +import os from collections import namedtuple + from pymongo import MongoClient try: @@ -66,10 +68,12 @@ def parse_config(config_filename): auth_db = parser.get('Auth', 'auth_db') auth_user = parser.get('Auth', 'auth_user') auth_pass = parser.get('Auth', 'auth_pass') + db_host = parser.get('Auth', 'db_host') else: auth_db = '' auth_user = '' auth_pass = '' + db_host = os.getenv('MONGO_HOST') or None if 'Logging' in parser.sections(): log_file = parser.get('Logging', 'log_file') else: @@ -85,12 +89,13 @@ def parse_config(config_filename): 'log_file', 'auth_db', 'auth_user', - 'auth_pass']) + 'auth_pass', + 'db_host']) file_list = file_attrs(scraper_stem, recordfile_stem, fullfile_stem, eventfile_stem, dupfile_stem, outputfile_stem, oneaday_filter, log_file, auth_db, auth_user, - auth_pass) + auth_pass, db_host) return server_list, file_list except Exception as e: @@ -134,7 +139,7 @@ def do_RuntimeError(st1, filename='', st2=''): raise RuntimeError(st1 + ' ' + filename + ' ' + st2) -def make_conn(db_auth, db_user, db_pass): +def make_conn(db_auth, db_user, db_pass, db_host=None): """ Function to establish a connection to a local MonoDB instance. @@ -158,7 +163,11 @@ def make_conn(db_auth, db_user, db_pass): Collection within MongoDB that holds the scraped news stories. 
""" - client = MongoClient() + + if db_host: + client = MongoClient(db_host) + else: + client = MongoClient() if db_auth: client[db_auth].authenticate(db_user, db_pass) database = client.event_scrape From a27bd5a300ba8e26c2e3ffd13b3e22e3bbd0ebca Mon Sep 17 00:00:00 2001 From: Parham Negahdar Date: Thu, 30 Apr 2015 13:34:53 -0400 Subject: [PATCH 04/24] Add scheduler --- pipeline.py | 15 +++++++++------ pipeline_sched.py | 9 +++++++++ requirements.txt | 17 ++++++++++------- 3 files changed, 28 insertions(+), 13 deletions(-) create mode 100644 pipeline_sched.py diff --git a/pipeline.py b/pipeline.py index d2cd443..7aa819d 100644 --- a/pipeline.py +++ b/pipeline.py @@ -84,10 +84,10 @@ def main(file_details, server_details, logger_file=None, run_filter=None, if run_filter == 'False': print('Running PETRARCH and writing to a file. No one-a-day.') logger.info('Running PETRARCH and writing to a file. No one-a-day.') - #Command to write output to a file directly from PETR -# petrarch.run_pipeline(formatted, -# '{}{}.txt'.format(file_details.fullfile_stem, -# date_string), parsed=True) + # Command to write output to a file directly from PETR + # petrarch.run_pipeline(formatted, + # '{}{}.txt'.format(file_details.fullfile_stem, + # date_string), parsed=True) petr_results = petrarch.run_pipeline(formatted, write_output=False, parsed=True) elif run_filter == 'True': @@ -131,9 +131,12 @@ def main(file_details, server_details, logger_file=None, run_filter=None, print('PHOX.pipeline end:', datetime.datetime.utcnow()) -if __name__ == '__main__': - # initialize the various utilities globals +def run(): server_details, file_details = utilities.parse_config('PHOX_config.ini') main(file_details, server_details, file_details.log_file, run_filter=file_details.oneaday_filter, version='v0.0.0') + + +if __name__ == '__main__': + run() diff --git a/pipeline_sched.py b/pipeline_sched.py new file mode 100644 index 0000000..1d3804d --- /dev/null +++ b/pipeline_sched.py @@ -0,0 +1,9 @@ +from 
pipeline import run + +from apscheduler.schedulers.blocking import BlockingScheduler +from apscheduler.triggers.cron import CronTrigger + +if __name__ == '__main__': + scheduler = BlockingScheduler() + scheduler.add_job(run, CronTrigger(minute=0)) + scheduler.start() diff --git a/requirements.txt b/requirements.txt index 4a2394b..8c9ff67 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,20 @@ +APScheduler==3.0.3 +corenlp-python==3.2.0.post3 +docutils==0.11 +futures==2.2.0 Jinja2==2.7.2 MarkupSafe==0.23 -PyYAML==3.11 -Pygments==1.6 -Sphinx==1.2.2 -Unidecode==0.04.16 -corenlp-python==3.2.0-3 -docutils==0.11 numpydoc==0.4 -e git+https://github.com/openeventdata/petrarch.git@5fe92b676e8b4fb9a1964e7da55e9c95e9a5745f#egg=petrarch-master pexpect==3.2 +Pygments==1.6 pymongo==2.7 python-dateutil==2.2 +pytz==2015.2 +PyYAML==3.11 requests==2.3.0 six==1.6.1 -wsgiref==0.1.2 +Sphinx==1.2.2 +tzlocal==1.1.3 +Unidecode==0.4.16 xmltodict==0.9.0 From 68692a336631f1441f104ca89e0decd1afb61a74 Mon Sep 17 00:00:00 2001 From: Parham Negahdar Date: Thu, 30 Apr 2015 14:09:25 -0400 Subject: [PATCH 05/24] Fix schedule --- pipeline_sched.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline_sched.py b/pipeline_sched.py index 1d3804d..35c4850 100644 --- a/pipeline_sched.py +++ b/pipeline_sched.py @@ -5,5 +5,5 @@ if __name__ == '__main__': scheduler = BlockingScheduler() - scheduler.add_job(run, CronTrigger(minute=0)) + scheduler.add_job(run, CronTrigger(minute=0, hour=1)) scheduler.start() From 91f9b0bc58c365584df4739d13bc8580b2bd024e Mon Sep 17 00:00:00 2001 From: The Gitter Badger Date: Sun, 6 Mar 2016 18:56:02 +0000 Subject: [PATCH 06/24] Add Gitter badge --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 2659250..6e0baaa 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ phoenix_pipeline ================ +[![Join the chat at 
https://gitter.im/openeventdata/phoenix_pipeline](https://badges.gitter.im/openeventdata/phoenix_pipeline.svg)](https://gitter.im/openeventdata/phoenix_pipeline?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) + Turning news into events since 2014. This system links a series of Python programs to convert the files which have been From a06c4cbee09913a888ad03d1cc3850fe7922ac7d Mon Sep 17 00:00:00 2001 From: Andy Halterman Date: Mon, 27 Jun 2016 10:38:36 -0500 Subject: [PATCH 07/24] Petrarch2 option from config, closes #90 --- PHOX_config.ini | 3 +++ pipeline.py | 15 ++++++++++++--- utilities.py | 8 +++++++- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/PHOX_config.ini b/PHOX_config.ini index 435b006..1f0396f 100644 --- a/PHOX_config.ini +++ b/PHOX_config.ini @@ -17,6 +17,9 @@ newsourcestem = newsources. oneaday_filter = True +[Petrarch] +petrarch_version = 1 + #[Logging] #log_file = /root/logs/pipeline.log diff --git a/pipeline.py b/pipeline.py index 7aa819d..52440c1 100644 --- a/pipeline.py +++ b/pipeline.py @@ -11,7 +11,18 @@ import oneaday_filter import result_formatter import scraper_connection -from petrarch import petrarch +#from petrarch2 import petrarch2 + +server_details, file_details, petrarch_version = utilities.parse_config('PHOX_config.ini') +if petrarch_version == '1': + from petrarch import petrarch + print("Using original Petrarch version") +elif petrarch_version == '2': + from petrarch2 import petrarch2 as petrarch + print("Using Petrarch2") +else: + print("Invalid Petrarch version. 
Argument must be '1' or '2'") + def main(file_details, server_details, logger_file=None, run_filter=None, @@ -132,8 +143,6 @@ def main(file_details, server_details, logger_file=None, run_filter=None, def run(): - server_details, file_details = utilities.parse_config('PHOX_config.ini') - main(file_details, server_details, file_details.log_file, run_filter=file_details.oneaday_filter, version='v0.0.0') diff --git a/utilities.py b/utilities.py index ec1f54f..7a63dda 100644 --- a/utilities.py +++ b/utilities.py @@ -34,6 +34,9 @@ def parse_config(config_filename): file_list: Named tuple. All the other config information not in ``server_list``. + + petrarch_version: Int + Either 1 or 2, indicating whether Petrarch or Petrarch2 should be used. """ parser = ConfigParser() parser.read(config_filename) @@ -79,6 +82,9 @@ def parse_config(config_filename): else: log_file = '' + petrarch_version = parser.get('Petrarch', 'petrarch_version') + print("petrarch version is {}".format(petrarch_version)) + file_attrs = namedtuple('FileAttributes', ['scraper_stem', 'recordfile_stem', 'fullfile_stem', @@ -97,7 +103,7 @@ def parse_config(config_filename): oneaday_filter, log_file, auth_db, auth_user, auth_pass, db_host) - return server_list, file_list + return server_list, file_list, petrarch_version except Exception as e: print('Problem parsing config file. 
{}'.format(e)) From cf916f644cfee85d365f40206e0dd2b260004ad6 Mon Sep 17 00:00:00 2001 From: Andy Halterman Date: Tue, 28 Jun 2016 12:39:12 -0500 Subject: [PATCH 08/24] Add Mordecai integration --- PHOX_config.ini | 9 ++++-- geolocation.py | 84 ++++++++++++++++++++++++++++++++++++++++++++++--- pipeline.py | 17 ++++++---- postprocess.py | 15 +++++++-- utilities.py | 31 +++++++++++++----- 5 files changed, 134 insertions(+), 22 deletions(-) diff --git a/PHOX_config.ini b/PHOX_config.ini index 1f0396f..6807dc7 100644 --- a/PHOX_config.ini +++ b/PHOX_config.ini @@ -3,8 +3,13 @@ server_name = username = password = server_dir = public_html/datasets/phoenix/ -cliff_host = localhost + +[Geolocation] +geo_service = Mordecai +cliff_host = http://localhost cliff_port = 8999 +mordecai_host = http://localhost +mordecai_port = 5011 [Pipeline] scraper_stem = scraper_results_ @@ -18,7 +23,7 @@ newsourcestem = newsources. oneaday_filter = True [Petrarch] -petrarch_version = 1 +petrarch_version = 2 #[Logging] #log_file = /root/logs/pipeline.log diff --git a/geolocation.py b/geolocation.py index 4a28312..ba54118 100644 --- a/geolocation.py +++ b/geolocation.py @@ -4,6 +4,7 @@ import requests import utilities from bson.objectid import ObjectId +import json def query_cliff(sentence, host, port): """ @@ -230,12 +231,84 @@ def iso_convert(iso2c): iso3c = "NA" return iso3c +def query_mordecai(sentence, host, port): + """ + Takes a sentence from a news article, passes it to the Mordecai geolocation + service, and extracts the relevant data that Mordecai returns. + Parameters + ---------- + sentence: String. + Text from which an event was coded. + Returns + ------- + lat: String. + Latitude of a location. + lon: String. + Longitude of a location. + placeName: String. + The name of the most precise location extracted from the sentence. + stateName: String. + The name of the state/region/province extracted from the sentence. + countryCode: String. 
+ The ISO 3 character country code of the country extracted from the sentence. + """ + headers = {'Content-Type': 'application/json'} + data = {'text': sentence} + data = json.dumps(data) + dest = "{0}:{1}/places".format(host, port) + out = requests.post(dest, data=data, headers=headers) + return json.loads(out.text) + + +def mordecai(events, file_details, server_details, geo_details): + """ + Pulls out a database ID and queries the Mordecai geolocation system + running locally and find location information within the sentence. + Parameters + ---------- + events: Dictionary. + Contains filtered events from the one-a-day filter. Keys are + (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of + IDs, sources, and issues. + Returns + ------- + events: Dictionary. + Same as in the parameter but with the addition of a value that is + a list of lon, lat, placeName, stateName, countryCode. + """ + coll = utilities.make_conn(file_details.auth_db, file_details.auth_user, + file_details.auth_pass) + + for event in events: + event_id, sentence_id = events[event]['ids'][0].split('_') + # print(event_id) + result = coll.find_one({'_id': ObjectId(event_id.split('_')[0])}) + sents = utilities.sentence_segmenter(result['content']) + + query_text = sents[int(sentence_id)] + geo_info = query_mordecai(query_text, geo_details.mordecai_host, + geo_details.mordecai_port) + print(geo_info) + try: + # temporary hack: take the first location: + geo_info = geo_info[0] + # NA is for ADM1, which mord doesn't return. 
See issue #2 + events[event]['geo'] = (geo_info['lon'], geo_info['lat'], + geo_info['placename'], "NA", geo_info['countrycode']) + print("worked") + except Exception as e: + print("error") + print(e) + events[event]['geo'] = ("NA", "NA", "NA", "NA", "NA") + + return events -def main(events, file_details, server_details): +def cliff(events, file_details, server_details, geo_details): """ Pulls out a database ID and runs the ``query_cliff`` function to hit MIT's - CLIFF/CLAVIN geolocation system running locally and find location - information within the sentence. + CLIFF/CLAVIN geolocation system running locally and find location + information within the sentence. Note, this function calls back to the database + where stories are stored. Parameters ---------- events: Dictionary. @@ -252,14 +325,15 @@ def main(events, file_details, server_details): file_details.auth_pass) for event in events: + print(event) event_id, sentence_id = events[event]['ids'][0].split('_') # print(event_id) result = coll.find_one({'_id': ObjectId(event_id.split('_')[0])}) sents = utilities.sentence_segmenter(result['content']) query_text = sents[int(sentence_id)] - geo_info = query_cliff(query_text, server_details.cliff_host, - server_details.cliff_port) + geo_info = query_cliff(query_text, geo_details.cliff_host, + geo_details.cliff_port) if geo_info: try: if geo_info['countryCode'] != "": diff --git a/pipeline.py b/pipeline.py index 52440c1..d8a8079 100644 --- a/pipeline.py +++ b/pipeline.py @@ -13,7 +13,7 @@ import scraper_connection #from petrarch2 import petrarch2 -server_details, file_details, petrarch_version = utilities.parse_config('PHOX_config.ini') +server_details, geo_details, file_details, petrarch_version = utilities.parse_config('PHOX_config.ini') if petrarch_version == '1': from petrarch import petrarch print("Using original Petrarch version") @@ -25,7 +25,7 @@ -def main(file_details, server_details, logger_file=None, run_filter=None, +def main(file_details, geo_details, 
server_details, logger_file=None, run_filter=None, run_date='', version=''): """ Main function to run all the things. @@ -36,6 +36,9 @@ def main(file_details, server_details, logger_file=None, run_filter=None, file_details: Named tuple. All the other config information not in ``server_details``. + geo_details: Named tuple. + Settings for geocoding. + server_details: Named tuple. Config information specifically related to the remote server for FTP uploading. @@ -77,7 +80,10 @@ def main(file_details, server_details, logger_file=None, run_filter=None, process_date.day) logger.info('Date string: {}'.format(date_string)) print('Date string:', date_string) - + print("process date:") + print(process_date) + print("file_details") + print(file_details) results, scraperfilename = scraper_connection.main(process_date, file_details) @@ -89,7 +95,6 @@ def main(file_details, server_details, logger_file=None, run_filter=None, print("Running Mongo.formatter.py") formatted = formatter.main(results, file_details, process_date, date_string) - logger.info("Running PETRARCH") file_details.fullfile_stem + date_string if run_filter == 'False': @@ -125,7 +130,7 @@ def main(file_details, server_details, logger_file=None, run_filter=None, print("Running postprocess.py") if version: postprocess.main(formatted_results, date_string, version, file_details, - server_details) + server_details, geo_details) else: print("Please specify a data version number. 
Program ending.") @@ -143,7 +148,7 @@ def main(file_details, server_details, logger_file=None, run_filter=None, def run(): - main(file_details, server_details, file_details.log_file, + main(file_details, geo_details, server_details, file_details.log_file, run_filter=file_details.oneaday_filter, version='v0.0.0') diff --git a/postprocess.py b/postprocess.py index e0e3890..bdcca85 100644 --- a/postprocess.py +++ b/postprocess.py @@ -358,7 +358,7 @@ def process_actors(event): return actors -def main(event_dict, this_date, version, file_details, server_details): +def main(event_dict, this_date, version, file_details, server_details, geo_details): """ Pulls in the coded results from PETRARCH dictionary in the {StoryID: [(record), (record)]} format and allows only one unique @@ -381,12 +381,23 @@ def main(event_dict, this_date, version, file_details, server_details): file_details: NamedTuple. Container generated from the config file specifying file stems and other relevant options. + server_details: NamedTuple. + Info for uploading to server. + geo_details: NamedTuple. + Info about geo type and geo server details. """ logger = logging.getLogger('pipeline_log') logger.info('Geolocating.') print('Geolocating') - updated_events = geolocation.main(event_dict, file_details, server_details) + geo_details.geo_service == "Mordecai" + print(event_dict) + if geo_details.geo_service == "CLIFF": + updated_events = geolocation.cliff(event_dict, file_details, server_details, geo_details) + elif geo_details.geo_service == "Mordecai": + updated_events = geolocation.mordecai(event_dict, file_details, server_details, geo_details) + else: + print("Invalid geo service name. 
Must be 'CLIFF' or 'Mordecai'.") logger.info('Formatting events for output.') event_write = create_strings(updated_events, version) diff --git a/utilities.py b/utilities.py index 7a63dda..dc6e3b0 100644 --- a/utilities.py +++ b/utilities.py @@ -32,6 +32,9 @@ def parse_config(config_filename): Config information specifically related to the remote server for FTP uploading. + geo_list : Named tuple. + Config information for geocoding. + file_list: Named tuple. All the other config information not in ``server_list``. @@ -47,17 +50,31 @@ def parse_config(config_filename): username = parser.get('Server', 'username') password = parser.get('Server', 'password') server_dir = parser.get('Server', 'server_dir') - cliff_host = parser.get('Server', 'cliff_host') - cliff_port = parser.get('Server', 'cliff_port') server_attrs = namedtuple('ServerAttributes', ['serv_name', 'username', 'password', - 'server_dir', - 'cliff_host', - 'cliff_port']) + 'server_dir']) server_list = server_attrs(serv_name, username, password, - server_dir, cliff_host, cliff_port) + server_dir) + + geo_service = parser.get('Geolocation', 'geo_service') + cliff_host = parser.get('Geolocation', 'cliff_host') + cliff_port = parser.get('Geolocation', 'cliff_port') + mordecai_host = parser.get('Geolocation', 'mordecai_host') + mordecai_port = parser.get('Geolocation', 'mordecai_port') + + + geo_attrs = namedtuple('GeolocationAttributes', ['geo_service', + 'cliff_host', + 'cliff_port', + 'mordecai_host', + 'mordecai_port' + ]) + + geo_list = geo_attrs(geo_service, cliff_host, cliff_port, + mordecai_host, mordecai_port) + # these are listed in the order generated scraper_stem = parser.get('Pipeline', 'scraper_stem') @@ -103,7 +120,7 @@ def parse_config(config_filename): oneaday_filter, log_file, auth_db, auth_user, auth_pass, db_host) - return server_list, file_list, petrarch_version + return server_list, geo_list, file_list, petrarch_version except Exception as e: print('Problem parsing config file. 
{}'.format(e)) From 27b3d259127591dc011580a1f1b9db94e0992e6a Mon Sep 17 00:00:00 2001 From: Andy Halterman Date: Tue, 28 Jun 2016 13:15:41 -0500 Subject: [PATCH 09/24] Put config in function, move prints to logs --- pipeline.py | 28 +++++++++++++--------------- utilities.py | 1 - 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/pipeline.py b/pipeline.py index 52440c1..074c386 100644 --- a/pipeline.py +++ b/pipeline.py @@ -11,21 +11,8 @@ import oneaday_filter import result_formatter import scraper_connection -#from petrarch2 import petrarch2 -server_details, file_details, petrarch_version = utilities.parse_config('PHOX_config.ini') -if petrarch_version == '1': - from petrarch import petrarch - print("Using original Petrarch version") -elif petrarch_version == '2': - from petrarch2 import petrarch2 as petrarch - print("Using Petrarch2") -else: - print("Invalid Petrarch version. Argument must be '1' or '2'") - - - -def main(file_details, server_details, logger_file=None, run_filter=None, +def main(file_details, server_details, petrarch_version, logger_file=None, run_filter=None, run_date='', version=''): """ Main function to run all the things. @@ -61,6 +48,16 @@ def main(file_details, server_details, logger_file=None, run_filter=None, # get a local copy for the pipeline logger = logging.getLogger('pipeline_log') + if petrarch_version == '1': + from petrarch import petrarch + logger.info("Using original Petrarch version") + elif petrarch_version == '2': + from petrarch2 import petrarch2 as petrarch + logger.info("Using Petrarch2") + else: + logger.error("Invalid Petrarch version. 
Argument must be '1' or '2'") + + print('\nPHOX.pipeline run:', datetime.datetime.utcnow()) if run_date: @@ -143,7 +140,8 @@ def main(file_details, server_details, logger_file=None, run_filter=None, def run(): - main(file_details, server_details, file_details.log_file, + server_details, file_details, petrarch_version = utilities.parse_config('PHOX_config.ini') + main(file_details, server_details, petrarch_version, file_details.log_file, run_filter=file_details.oneaday_filter, version='v0.0.0') diff --git a/utilities.py b/utilities.py index 7a63dda..115a320 100644 --- a/utilities.py +++ b/utilities.py @@ -83,7 +83,6 @@ def parse_config(config_filename): log_file = '' petrarch_version = parser.get('Petrarch', 'petrarch_version') - print("petrarch version is {}".format(petrarch_version)) file_attrs = namedtuple('FileAttributes', ['scraper_stem', 'recordfile_stem', From c5e5353a0770a8e2f9e9655e35424611f255c97b Mon Sep 17 00:00:00 2001 From: Andy Halterman Date: Tue, 28 Jun 2016 13:15:41 -0500 Subject: [PATCH 10/24] Put config in function, move prints to logs --- pipeline.py | 28 +++++++++++++--------------- utilities.py | 1 - 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/pipeline.py b/pipeline.py index d8a8079..a5d7ee4 100644 --- a/pipeline.py +++ b/pipeline.py @@ -11,21 +11,8 @@ import oneaday_filter import result_formatter import scraper_connection -#from petrarch2 import petrarch2 -server_details, geo_details, file_details, petrarch_version = utilities.parse_config('PHOX_config.ini') -if petrarch_version == '1': - from petrarch import petrarch - print("Using original Petrarch version") -elif petrarch_version == '2': - from petrarch2 import petrarch2 as petrarch - print("Using Petrarch2") -else: - print("Invalid Petrarch version. 
Argument must be '1' or '2'") - - - -def main(file_details, geo_details, server_details, logger_file=None, run_filter=None, +def main(file_details, geo_details, server_details, petrarch_version, logger_file=None, run_filter=None, run_date='', version=''): """ Main function to run all the things. @@ -64,6 +51,16 @@ def main(file_details, geo_details, server_details, logger_file=None, run_filter # get a local copy for the pipeline logger = logging.getLogger('pipeline_log') + if petrarch_version == '1': + from petrarch import petrarch + logger.info("Using original Petrarch version") + elif petrarch_version == '2': + from petrarch2 import petrarch2 as petrarch + logger.info("Using Petrarch2") + else: + logger.error("Invalid Petrarch version. Argument must be '1' or '2'") + + print('\nPHOX.pipeline run:', datetime.datetime.utcnow()) if run_date: @@ -148,7 +145,8 @@ def main(file_details, geo_details, server_details, logger_file=None, run_filter def run(): - main(file_details, geo_details, server_details, file_details.log_file, + server_details, geo_details, file_details, petrarch_version = utilities.parse_config('PHOX_config.ini') + main(file_details, geo_details, server_details, petrarch_version, file_details.log_file, run_filter=file_details.oneaday_filter, version='v0.0.0') diff --git a/utilities.py b/utilities.py index dc6e3b0..affb21e 100644 --- a/utilities.py +++ b/utilities.py @@ -100,7 +100,6 @@ def parse_config(config_filename): log_file = '' petrarch_version = parser.get('Petrarch', 'petrarch_version') - print("petrarch version is {}".format(petrarch_version)) file_attrs = namedtuple('FileAttributes', ['scraper_stem', 'recordfile_stem', From efef36fc45f799a5842b6f6d04cd23760d04966b Mon Sep 17 00:00:00 2001 From: Andy Halterman Date: Tue, 28 Jun 2016 15:00:21 -0500 Subject: [PATCH 11/24] Incorporate Petrarch2 change --- pipeline.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pipeline.py b/pipeline.py index a5d7ee4..7a1f440 100644 --- a/pipeline.py +++ 
b/pipeline.py @@ -149,6 +149,5 @@ def run(): main(file_details, geo_details, server_details, petrarch_version, file_details.log_file, run_filter=file_details.oneaday_filter, version='v0.0.0') - if __name__ == '__main__': run() From b15aa2971f2424106f21d721f2646f92da55171d Mon Sep 17 00:00:00 2001 From: Andy Halterman Date: Tue, 28 Jun 2016 12:39:12 -0500 Subject: [PATCH 12/24] Add Mordecai integration --- pipeline.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pipeline.py b/pipeline.py index 7a1f440..b0997ca 100644 --- a/pipeline.py +++ b/pipeline.py @@ -12,6 +12,7 @@ import result_formatter import scraper_connection + def main(file_details, geo_details, server_details, petrarch_version, logger_file=None, run_filter=None, run_date='', version=''): """ @@ -77,10 +78,6 @@ def main(file_details, geo_details, server_details, petrarch_version, logger_fil process_date.day) logger.info('Date string: {}'.format(date_string)) print('Date string:', date_string) - print("process date:") - print(process_date) - print("file_details") - print(file_details) results, scraperfilename = scraper_connection.main(process_date, file_details) From a378f44eef6ee031b18f687cc8bea15dcd2f53a3 Mon Sep 17 00:00:00 2001 From: Andy Halterman Date: Tue, 28 Jun 2016 16:27:34 -0500 Subject: [PATCH 13/24] Check geo service on start; reduce prints --- geolocation.py | 17 ++++++++++++----- pipeline.py | 13 +++++++++++++ postprocess.py | 2 -- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/geolocation.py b/geolocation.py index ba54118..c98a213 100644 --- a/geolocation.py +++ b/geolocation.py @@ -239,6 +239,10 @@ def query_mordecai(sentence, host, port): ---------- sentence: String. Text from which an event was coded. + host: String + Host where Mordecai is running (taken from config) + port: String + Port that Mordecai service is listening on Returns ------- lat: String. 
@@ -259,6 +263,14 @@ def query_mordecai(sentence, host, port): out = requests.post(dest, data=data, headers=headers) return json.loads(out.text) +def test_mordecai(sentence, host, port): + """ + Check if Mordecai service is up and responding on given host and port. + Parameters + ---------- + sentence: String. + Text from which an event was coded. + """ def mordecai(events, file_details, server_details, geo_details): """ @@ -288,17 +300,13 @@ def mordecai(events, file_details, server_details, geo_details): query_text = sents[int(sentence_id)] geo_info = query_mordecai(query_text, geo_details.mordecai_host, geo_details.mordecai_port) - print(geo_info) try: # temporary hack: take the first location: geo_info = geo_info[0] # NA is for ADM1, which mord doesn't return. See issue #2 events[event]['geo'] = (geo_info['lon'], geo_info['lat'], geo_info['placename'], "NA", geo_info['countrycode']) - print("worked") except Exception as e: - print("error") - print(e) events[event]['geo'] = ("NA", "NA", "NA", "NA", "NA") return events @@ -325,7 +333,6 @@ def cliff(events, file_details, server_details, geo_details): file_details.auth_pass) for event in events: - print(event) event_id, sentence_id = events[event]['ids'][0].split('_') # print(event_id) result = coll.find_one({'_id': ObjectId(event_id.split('_')[0])}) diff --git a/pipeline.py b/pipeline.py index b0997ca..9e11cdf 100644 --- a/pipeline.py +++ b/pipeline.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import sys import logging +import requests import datetime import dateutil import uploader @@ -80,6 +81,18 @@ def main(file_details, geo_details, server_details, petrarch_version, logger_fil print('Date string:', date_string) results, scraperfilename = scraper_connection.main(process_date, file_details) + if geo_details.geo_service == "Mordecai": + dest = "{0}:{1}/places".format(geo_details.mordecai_host, geo_details.mordecai_port) + try: + out = requests.get(dest) + assert out.status_code == 200 + except 
(AssertionError, requests.exceptions.ConnectionError): + print("Mordecai geolocation service not responding. Continuing anyway...") + elif geo_details.geo_service == "CLIFF": + print("CLIFF") + else: + print("Invalid geo service name. Must be 'CLIFF' or 'Mordecai'. Continuing...") + if scraperfilename: logger.info("Scraper file name: " + scraperfilename) diff --git a/postprocess.py b/postprocess.py index bdcca85..0ad7d29 100644 --- a/postprocess.py +++ b/postprocess.py @@ -390,8 +390,6 @@ def main(event_dict, this_date, version, file_details, server_details, geo_detai logger.info('Geolocating.') print('Geolocating') - geo_details.geo_service == "Mordecai" - print(event_dict) if geo_details.geo_service == "CLIFF": updated_events = geolocation.cliff(event_dict, file_details, server_details, geo_details) elif geo_details.geo_service == "Mordecai": From 51fb87588673d30c457caaa32a5e48e8fba894bf Mon Sep 17 00:00:00 2001 From: Andy Halterman Date: Wed, 29 Jun 2016 12:34:31 -0400 Subject: [PATCH 14/24] Add basic tests (#97) * Basic test of pipeline, Petr1 and Petr2 * Make work with pytest * Add Travis, clean up formatting. --- .travis.yml | 8 ++++ requirements.txt | 1 + tests/test_pipeline.py | 91 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+) create mode 100644 .travis.yml create mode 100644 tests/test_pipeline.py diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..eef5bd8 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,8 @@ +language: python +python: + - "2.7" +install: + - pip install -e . 
+ - pip install -r requirements.txt +script: py.test +sudo: false diff --git a/requirements.txt b/requirements.txt index 8c9ff67..868a5a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ Jinja2==2.7.2 MarkupSafe==0.23 numpydoc==0.4 -e git+https://github.com/openeventdata/petrarch.git@5fe92b676e8b4fb9a1964e7da55e9c95e9a5745f#egg=petrarch-master +-e git+https://github.com/openeventdata/petrarch2 pexpect==3.2 Pygments==1.6 pymongo==2.7 diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py new file mode 100644 index 0000000..659ae4e --- /dev/null +++ b/tests/test_pipeline.py @@ -0,0 +1,91 @@ +from bson.objectid import ObjectId +import datetime +from petrarch import petrarch +from petrarch2 import petrarch2 + +formatted = [{u'language': u'english', +u'title': u'6 killed in attacks in Iraqi capital Friday', +u'url': u'http://www.menafn.com/1094827896/6-killed-in-attacks-in-Iraqi-capital-Friday?src=RSS', +u'stanford': 1, +u'content': u'BAGHDAD: At least six people, including a soldier, were killed in a spate of attacks across Iraqi capital Baghdad on Friday. A sniper opened fire on soldiers manning a checkpoint in southern Baghdad, killing a soldier and injuring three others, police officer Nader al-Janabi told Anadolu Agency. Two civilians were killed and six others injured in a bomb blast in al-Zafarana district in south-eastern Baghdad, he said. Three more civilians were killed and seven others injured in two bomb blasts in southern and northern Baghdad, according to al-Janabi. Iraqi officials often blame the attacks on the Daesh terrorist group, which overran vast swathes of territory in Iraq in 2014. 
', +u'source': u'menafn_iraq', +u'parsed_sents': [u'(ROOT (S (NP (NNP BAGHDAD)) (: :) (NP (NP (QP (IN At) (JJS least) (CD six)) (NNS people)) (, ,) (PP (VBG including) (NP (DT a) (NN soldier))) (, ,)) (VP (VBD were) (VP (VBN killed) (PP (IN in) (NP (NP (DT a) (NN spate)) (PP (IN of) (NP (NNS attacks))))) (PP (IN across) (NP (JJ Iraqi) (NN capital) (NNP Baghdad))) (PP (IN on) (NP (NNP Friday))))) (. .)))', +u'(ROOT (S (NP (DT A) (NN sniper)) (VP (VBD opened) (NP (NN fire)) (PP (IN on) (S (S (NP (NNS soldiers)) (VP (VBG manning) (NP (NP (DT a) (NN checkpoint)) (PP (IN in) (NP (JJ southern) (NNP Baghdad)))) (, ,) (S (VP (VP (VBG killing) (NP (DT a) (NN soldier))) (CC and) (VP (VBG injuring) (NP (CD three) (NNS others))))))) (, ,) (NP (NNS police) (NN officer) (NNP Nader) (NNP al-Janabi)) (VP (VBD told) (NP (NNP Anadolu) (NNP Agency))) (. .))))))', +u'(ROOT (S (S (NP (CD Two) (NNS civilians)) (VP (VBD were) (VP (VP (VBN killed)) (CC and) (NP (NP (CD six) (NNS others)) (VP (VBN injured) (PP (IN in) (NP (NP (DT a) (NN bomb) (NN blast)) (PP (IN in) (NP (NP (NN al-Zafarana) (NN district)) (PP (IN in) (NP (JJ south-eastern) (NNP Baghdad)))))))))))) (, ,) (NP (PRP he)) (VP (VBD said)) (. .)))', +u'(ROOT (S (NP (CD Three) (JJR more) (NNS civilians)) (VP (VBD were) (VP (VP (VBN killed)) (CC and) (NP (NP (CD seven) (NNS others)) (VP (VBN injured) (PP (IN in) (NP (NP (CD two) (NN bomb) (NNS blasts)) (PP (IN in) (NP (ADJP (JJ southern) (CC and) (JJ northern)) (NNP Baghdad))))))) (, ,) (PP (VBG according) (PP (TO to) (NP (NNP al-Janabi)))))) (. .)))', +u'(ROOT (S (NP (JJ Iraqi) (NNS officials)) (ADVP (RB often)) (VP (VBP blame) (NP (NP (DT the) (NNS attacks)) (PP (IN on) (NP (NP (DT the) (NNP Daesh) (JJ terrorist) (NN group)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (VBD overran) (NP (NP (JJ vast) (NNS swathes)) (PP (IN of) (NP (NP (NN territory)) (PP (IN in) (NP (NNP Iraq)))))) (PP (IN in) (NP (CD 2014)))))))))) (. 
.)))'], +u'date': u'160626', +u'date_added': datetime.datetime(2016, 6, 26, 19, 0, 17, 640000), +u'_id': ObjectId('57702641172ab87eb7dc98fa')}, +{u'language': u'english', +u'title': u'Soldiers, Policemen Fight Over Rice', +u'url': u'http://www.thetidenewsonline.com/2016/06/24/soldiers-policemen-fight-over-rice/', +u'stanford': 1, +u'content': u'There was chaos at the Borno State Government House in Maiduguri, yesterday, as soldiers and policemen engaged in gun battle over rice meant for internally displaced persons. The Government House is besieged daily by thousands of internally displaced persons within Maiduguri metropolis, who choose to stay outside of the designated camps. The IDPs, who queue for hours to receive rice and other relief items, often cause gridlock around the Government House with many of them having to go back empty handed each day. The situation, however, turned violent, yesterday afternoon when the soldiers that were deployed to maintain law and order tried to benefit from the largese. The soldiers were said to have tried to force their way into the Deputy Governor\u2019s office, the place designated for the distribution, to get their vehicles filled. An attempt by the mobile policemen attached to the office to prevent the soldiers from achieving their goal led to a shootout. It was gathered that the soldiers fired several warning shots and the mobile policemen shot back in return, while also firing canisters of tear gas. Lucky Irabor, to get the furious soldiers to withdraw from the battle, which caused panic across Maiduguri. 
It was gathered that Irabor, the most senior military officer around, and the Commissioner of Police, Aminchi Baraya, subsequently visited the injured policeman at the hospital.', +u'source': u'nigeria_tidenews', +u'parsed_sents': [u'(ROOT (S (NP (EX There)) (VP (VBD was) (NP (NP (NN chaos)) (PP (IN at) (NP (NP (DT the) (NNP Borno) (NNP State) (NNP Government) (NNP House)) (PP (IN in) (NP (NNP Maiduguri))))) (, ,) (NP (NN yesterday)) (, ,)) (PP (IN as) (NP (NP (NNS soldiers) (CC and) (NNS policemen)) (VP (VBN engaged) (PP (IN in) (NP (NP (NN gun) (NN battle)) (PP (IN over) (NP (NP (NN rice)) (VP (VBN meant) (PP (IN for) (NP (ADJP (RB internally) (JJ displaced)) (NNS persons)))))))))))) (. .)))', +u'(ROOT (S (NP (DT The) (NNP Government) (NNP House)) (VP (VBZ is) (VP (VBN besieged) (ADVP (RB daily)) (PP (IN by) (NP (NP (NNS thousands)) (PP (IN of) (NP (NP (ADJP (RB internally) (JJ displaced)) (NNS persons)) (PP (IN within) (NP (NP (NNP Maiduguri) (NN metropolis)) (, ,) (SBAR (WHNP (WP who)) (S (VP (VBP choose) (S (VP (TO to) (VP (VB stay) (ADVP (IN outside) (PP (IN of) (NP (DT the) (VBN designated) (NNS camps)))))))))))))))))) (. .)))', +u'(ROOT (NP (NP (NP (DT The) (NNS IDPs)) (, ,) (SBAR (WHNP (WP who)) (S (VP (VB queue) (SBAR (IN for) (S (NP (NNS hours)) (VP (TO to) (VP (VB receive) (NP (NP (NN rice) (CC and) (JJ other) (NN relief) (NNS items)) (, ,) (S (ADVP (RB often)) (VP (VBP cause) (NP (NN gridlock)) (PP (IN around) (S (NP (NP (DT the) (NNP Government) (NNP House)) (PP (IN with) (NP (NP (JJ many)) (PP (IN of) (NP (PRP them)))))) (VP (VBG having) (S (VP (TO to) (VP (VB go) (NP (ADJP (RB back) (JJ empty)) (NN handed)) (NP (DT each) (NN day))))))))))))))))))) (. 
.)))', +u'(ROOT (S (S (NP (DT The) (NN situation)) (, ,) (ADVP (RB however)) (, ,) (VP (VBD turned) (ADJP (JJ violent)))) (, ,) (NP (NP (NN yesterday) (NN afternoon)) (SBAR (WHADVP (WRB when)) (S (NP (NP (DT the) (NNS soldiers)) (SBAR (WHNP (WDT that)) (S (VP (VBD were) (VP (VBN deployed) (S (VP (TO to) (VP (VP (VB maintain) (NP (NN law))) (CC and) (VP (NN order) (VP (VBD tried) (S (VP (TO to) (VP (VB benefit) (PP (IN from) (NP (DT the) (NN largese))))))))))))))))))) (. .)))', +u"(ROOT (S (NP (DT The) (NNS soldiers)) (VP (VBD were) (VP (VBN said) (S (VP (TO to) (VP (VB have) (VP (VBN tried) (S (VP (TO to) (VP (VB force) (NP (PRP$ their) (NN way)) (PP (IN into) (NP (NP (NP (DT the) (NNP Deputy) (NNP Governor) (POS 's)) (NN office)) (, ,) (NP (NP (DT the) (NN place)) (VP (VBN designated) (PP (IN for) (NP (DT the) (NN distribution))))) (, ,)))))) (S (VP (TO to) (VP (VB get) (S (NP (PRP$ their) (NNS vehicles)) (VP (VBN filled)))))))))))) (. .)))", +u'(ROOT (S (NP (NP (DT An) (NN attempt)) (PP (IN by) (NP (NP (DT the) (JJ mobile) (NNS policemen)) (VP (VBN attached) (PP (TO to) (NP (DT the) (NN office))) (S (VP (TO to) (VP (VB prevent) (NP (DT the) (NNS soldiers)) (PP (IN from) (S (VP (VBG achieving) (NP (PRP$ their) (NN goal)))))))))))) (VP (VBD led) (PP (TO to) (NP (DT a) (NN shootout)))) (. .)))', +u'(ROOT (S (NP (PRP It)) (VP (VBD was) (VP (VBN gathered) (SBAR (IN that) (S (NP (DT the) (NNS soldiers)) (VP (VBD fired) (SBAR (S (NP (NP (JJ several) (VBG warning) (NNS shots)) (CC and) (NP (DT the) (JJ mobile) (NNS policemen))) (VP (VBD shot) (ADVP (RB back)) (PP (IN in) (NP (NN return))) (, ,) (SBAR (IN while) (S (ADVP (RB also)) (VP (NN firing) (NP (NP (NNS canisters)) (PP (IN of) (S (VP (VB tear) (NP (NN gas))))))))))))))))) (. 
.)))'], +u'date': u'160624', +u'date_added': datetime.datetime(2016, 6, 26, 19, 0, 18), +u'_id': ObjectId('57702642172ab87eb5dc98e9')}, +{ "_id" : ObjectId("57702678172ab87ec2dc9933"), +"content" : "BAGHDAD - A senior Iraqi commander said the city of Fallujah was \"fully liberated\" from Islamic State of Iraq and Syria (ISIS) militants on Sunday, after a more than monthlong military operation. Iraqi troops have entered the northwestern al-Julan neighborhood, the last area of Fallujah to remain under ISIS control, the head of the counterterrorism forces in the operation, Lt. Gen. Abdul-Wahab al-Saadi, told The Associated Press. Al-Saadi said the operation, which began in late May, \"is done and the city is fully liberated.\" The Iraqi army was backed by U.S.-led airstrikes and paramilitary troops, mostly Shiite militias. \"From the center of al-Julan neighborhood, we congratulate the Iraqi people and the commander in chief...and declare that the Fallujah fight is over,\" he told Iraqi state TV, flanked by military officers and soldiers. Some of the soldiers were shooting in the air, chanting and waving the Iraqi flag. He added that troops will start working on removing bombs from the city's streets and buildings. In a statement, the U.S. central military command overseeing the U.S.-led coalition in Iraq said: \"The Coalition continues to provide support through strikes, intelligence, and advice and assistance to the Iraqi Security Forces operating in Fallujah and will continue to do so through deliberate clearing operations.\" Prime Minister Haider al-Abadi declared victory in Fallujah over a week ago, after Iraqi forces advanced into the city center and took control of a government complex. He pledged that remaining pockets of ISIS fighters would be cleared out within hours, but fierce clashes on the city's northern and western edges persisted for days. 
Tens of thousands of people have fled the fighting, overwhelming camps for the displaced run by the government and aid groups. According to the U.N. refugee agency, more than 85,000 people have fled Fallujah and the surrounding area since the offensive began. The UNHCR and others have warned of dire conditions in the camps -- where temperatures are well over 40 degrees Celsius (104 F) and shelter is limited -- and have called for more funds to meet mounting needs. Fallujah, which is located about 40 miles west of Baghdad, was the first city to fall to IS, in January 2014. Fallujah was also a stronghold of Sunni insurgents following the U.S.-led invasion in 2003. More than 100 American soldiers died and hundreds more were wounded in intense, house-by-house fighting in Fallujah in 2004. ISIS extremists still control significant areas in northern and western Iraq, including the country's second-largest city, Mosul. The group declared an Islamic caliphate on the territory it holds in Iraq and Syria and at the height of its power was estimated to hold nearly a third of each country. More than 3.3 million Iraqis have fled their homes since ISIS swept across northern and western Iraq in the summer of 2014, according to U.N. figures. 
More than 40 percent of the displaced are from Anbar province, where Fallujah is located.", +"source" : "cbs_world", +"date" : "Sun, 26 Jun 2016 17:37:27 -0400", +"language" : "english", +"title" : "Iraq: Fallujah \"fully liberated\" after monthlong fight", +"url" : "http://www.cbsnews.com/news/iraqi-commander-fallujah-fully-liberated-after-a-month/", +"date_added" : datetime.datetime(2016, 6, 26, 19, 0, 18), +"stanford" : 1, +"parsed_sents" : [ "(ROOT (S (NP (NNP BAGHDAD) (: -) (NN A) (JJ senior) (JJ Iraqi) (NN commander)) (VP (VBD said) (SBAR (S (NP (NP (DT the) (NN city)) (PP (IN of) (NP (NNP Fallujah)))) (VP (VBD was) (`` ``) (VP (ADVP (RB fully)) (VBN liberated) ('' '') (PP (IN from) (NP (NP (JJ Islamic) (NN State) (PP (IN of) (NP (NP (NNP Iraq)) (CC and) (NP (NNP Syria) (PRN (-LRB- -LRB-) (NNP ISIS) (-RRB- -RRB-)) (NNS militants))))) (PP (IN on) (NP (NNP Sunday)))))) (, ,) (PP (IN after) (NP (DT a) (ADVP (JJR more) (IN than)) (JJ monthlong) (JJ military) (NN operation))))))) (. .)))", +"(ROOT (S (S (NP (JJ Iraqi) (NNS troops)) (VP (VBP have) (VP (VBN entered) (NP (DT the) (JJ northwestern) (JJ al-Julan) (NN neighborhood))))) (, ,) (NP (NP (NP (DT the) (JJ last) (NN area)) (PP (IN of) (NP (NNP Fallujah))) (S (VP (TO to) (VP (VB remain) (PP (IN under) (NP (NNP ISIS) (NN control))))))) (, ,) (NP (NP (DT the) (NN head)) (PP (IN of) (NP (NP (DT the) (NN counterterrorism) (NNS forces)) (PP (IN in) (NP (DT the) (NN operation)))))) (, ,) (NP (NNP Lt.) (NNP Gen.) (NNP Abdul-Wahab) (NNP al-Saadi)) (, ,)) (VP (VBD told) (NP (DT The) (NNP Associated) (NNP Press))) (. .)))", +"(ROOT (S (NP (NNP Al-Saadi)) (VP (VBD said) (SBAR (S (S (NP (NP (DT the) (NN operation)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (VBD began) (PP (IN in) (NP (JJ late) (NNP May)))))) (, ,)) (`` ``) (VP (VBZ is) (VP (VBN done)))) (CC and) (S (NP (DT the) (NN city)) (VP (VBZ is) (ADVP (RB fully)) (VP (VBN liberated))))))) (. .) 
('' '')))", +"(ROOT (S (`` ``) (S (PP (IN From) (NP (NP (DT the) (NN center)) (PP (IN of) (NP (JJ al-Julan) (NN neighborhood))))) (, ,) (NP (PRP we)) (VP (VP (VBP congratulate) (NP (NP (DT the) (JJ Iraqi) (NNS people)) (CC and) (NP (NP (DT the) (NN commander)) (PP (IN in) (NP (NN chief)))))) (: ...) (CC and) (VP (VB declare) (SBAR (IN that) (S (NP (DT the) (NNP Fallujah) (NN fight)) (VP (VBZ is) (ADVP (IN over)))))))) (, ,) ('' '') (NP (PRP he)) (VP (VBD told) (NP (JJ Iraqi) (NN state) (NN TV)) (, ,) (S (VP (VBN flanked) (PP (IN by) (NP (JJ military) (NNS officers) (CC and) (NNS soldiers)))))) (. .)))", +"(ROOT (S (PP (IN In) (NP (DT a) (NN statement))) (, ,) (NP (NP (DT the) (NNP U.S.) (JJ central) (JJ military) (NN command)) (VP (VBG overseeing) (NP (NP (DT the) (JJ U.S.-led) (NN coalition)) (PP (IN in) (NP (NNP Iraq)))))) (VP (VBD said) (: :) (`` ``) (S (NP (DT The) (NNP Coalition)) (VP (VP (VBZ continues) (S (VP (TO to) (VP (VB provide) (NP (NN support)) (PP (IN through) (NP (NP (NP (NNS strikes)) (, ,) (NP (NN intelligence)) (, ,) (CC and) (NP (NN advice))) (CC and) (NP (NP (NN assistance)) (PP (TO to) (NP (NP (DT the) (JJ Iraqi) (NN Security) (NNS Forces)) (VP (VBG operating) (PP (IN in) (NP (NNP Fallujah))))))))))))) (CC and) (VP (MD will) (VP (VB continue) (S (VP (TO to) (VP (VB do) (ADVP (RB so))))) (PP (IN through) (NP (JJ deliberate) (NN clearing) (NNS operations)))))))) (. .) ('' '')))", +"(ROOT (S (NP (PRP He)) (VP (VP (VBD pledged) (SBAR (IN that) (S (NP (NP (VBG remaining) (NNS pockets)) (PP (IN of) (NP (NNP ISIS) (NNS fighters)))) (VP (MD would) (VP (VB be) (VP (VBN cleared) (PRT (RP out)) (PP (IN within) (NP (NNS hours))))))))) (, ,) (CC but) (S (NP (NP (JJ fierce) (NNS clashes)) (PP (IN on) (NP (NP (DT the) (NN city) (POS 's)) (ADJP (JJ northern) (CC and) (JJ western)) (NNS edges)))) (VP (VBD persisted) (PP (IN for) (NP (NNS days)))))) (. 
.)))", +"(ROOT (S (NP (NP (NNS Tens)) (PP (IN of) (NP (NP (NNS thousands)) (PP (IN of) (NP (NNS people)))))) (VP (VBP have) (VP (VBN fled) (NP (NP (DT the) (NN fighting)) (, ,) (NP (NP (JJ overwhelming) (NNS camps)) (PP (IN for) (NP (DT the) (JJ displaced) (NN run)))) (PP (IN by) (NP (DT the) (NN government) (CC and) (NN aid) (NNS groups)))))) (. .)))" ] }] + +def test_petr1_formatted_to_results(): + petr1_results = petrarch.run_pipeline(formatted, write_output=False, + parsed=True) + correct1_results = {'57702678172ab87ec2dc9933': + [(u'20160626', u'IRQ', u'MED', u'010', u'57702678172ab87ec2dc9933_1', + 'http://www.cbsnews.com/news/iraqi-commander-fallujah-fully-liberated-after-a-month/', + 'cbs_world'), + (u'20160626', u'IRQMIL', u'IRQ', u'010', u'NAMED_TERROR_GROUP,1', + u'57702678172ab87ec2dc9933_0', + 'http://www.cbsnews.com/news/iraqi-commander-fallujah-fully-liberated-after-a-month/', + 'cbs_world') + ]} + assert petr1_results == correct1_results + +def test_petr2_formatted_to_results(): + petr2_results = petrarch2.run_pipeline(formatted, write_output=False, + parsed=True) + correct2_results = {'57702678172ab87ec2dc9933': + [(u'20160626', u'IRQMIL', u'MED', u'010', u'57702678172ab87ec2dc9933_1', + 'http://www.cbsnews.com/news/iraqi-commander-fallujah-fully-liberated-after-a-month/', + 'cbs_world'), + (u'20160626', u'IRQMIL', u'IRQ', u'010', u'NAMED_TERROR_GROUP,1', u'57702678172ab87ec2dc9933_0', + 'http://www.cbsnews.com/news/iraqi-commander-fallujah-fully-liberated-after-a-month/', + 'cbs_world') + ], + '57702642172ab87eb5dc98e9': + [(u'20160624', u'NGAPPL', u'---GOV', u'191', u'REFUGEES,1', u'57702642172ab87eb5dc98e9_1', + u'http://www.thetidenewsonline.com/2016/06/24/soldiers-policemen-fight-over-rice/', + u'nigeria_tidenews')], + '57702641172ab87eb7dc98fa': + [(u'20160626', u'IRQ', u'IMGMUSISIUAF', u'111', u'TERROR,1', + u'57702641172ab87eb7dc98fa_4', + u'http://www.menafn.com/1094827896/6-killed-in-attacks-in-Iraqi-capital-Friday?src=RSS', + 
u'menafn_iraq'), + (u'20160626', u'---CVL', u'IRQ', u'190', u'57702641172ab87eb7dc98fa_3', + u'http://www.menafn.com/1094827896/6-killed-in-attacks-in-Iraqi-capital-Friday?src=RSS', + u'menafn_iraq')]} + assert petr2_results == correct2_results + From f0ef167dd2e1a35157c54ca09e44ffa4fb969325 Mon Sep 17 00:00:00 2001 From: John Beieler Date: Wed, 29 Jun 2016 12:36:48 -0400 Subject: [PATCH 15/24] Add in travis status badge. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 6e0baaa..379ac0d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ phoenix_pipeline ================ +[![Build Status](https://travis-ci.org/openeventdata/phoenix_pipeline.svg?branch=master)](https://travis-ci.org/openeventdata/phoenix_pipeline) [![Join the chat at https://gitter.im/openeventdata/phoenix_pipeline](https://badges.gitter.im/openeventdata/phoenix_pipeline.svg)](https://gitter.im/openeventdata/phoenix_pipeline?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) Turning news into events since 2014. From c349658c9e773a76bb4ca0a60fb64e92be7852c3 Mon Sep 17 00:00:00 2001 From: John Beieler Date: Wed, 29 Jun 2016 12:38:35 -0400 Subject: [PATCH 16/24] Fix travis config. --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index eef5bd8..451f50e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,6 @@ language: python python: - "2.7" install: - - pip install -e . - pip install -r requirements.txt script: py.test sudo: false From 751e85ce79273e80a6709c5f346837a5023808c6 Mon Sep 17 00:00:00 2001 From: John Beieler Date: Wed, 29 Jun 2016 12:41:45 -0400 Subject: [PATCH 17/24] Update reqs file. 
--- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 868a5a0..cf173b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ Jinja2==2.7.2 MarkupSafe==0.23 numpydoc==0.4 -e git+https://github.com/openeventdata/petrarch.git@5fe92b676e8b4fb9a1964e7da55e9c95e9a5745f#egg=petrarch-master --e git+https://github.com/openeventdata/petrarch2 +-e git+https://github.com/openeventdata/petrarch2.git#egg=petrarch2-master pexpect==3.2 Pygments==1.6 pymongo==2.7 From 220a0a5d97b5705c382f56ceee584ea9f4086018 Mon Sep 17 00:00:00 2001 From: Andy Halterman Date: Sun, 3 Jul 2016 10:49:08 -0400 Subject: [PATCH 18/24] Confirm CLIFF still works. --- PHOX_config.ini | 2 +- geolocation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PHOX_config.ini b/PHOX_config.ini index 6807dc7..26ad5d7 100644 --- a/PHOX_config.ini +++ b/PHOX_config.ini @@ -7,7 +7,7 @@ server_dir = public_html/datasets/phoenix/ [Geolocation] geo_service = Mordecai cliff_host = http://localhost -cliff_port = 8999 +cliff_port = 8080 mordecai_host = http://localhost mordecai_port = 5011 diff --git a/geolocation.py b/geolocation.py index c98a213..906c701 100644 --- a/geolocation.py +++ b/geolocation.py @@ -34,7 +34,7 @@ def query_cliff(sentence, host, port): place_info = {'lat': '', 'lon': '', 'placeName': '', 'countryCode': '', 'stateName': '', 'restype' : ''} - cliff_address = "http://{}:{}/CLIFF-2.0.0/parse/text".format(host, port) + cliff_address = "{}:{}/CLIFF-2.0.0/parse/text".format(host, port) try: located = requests.get(cliff_address, params=payload).json() From df3e4f67f60887574887c3173ac7d3bfcec9abd3 Mon Sep 17 00:00:00 2001 From: Andy Halterman Date: Wed, 29 Jun 2016 09:16:03 -0500 Subject: [PATCH 19/24] Update docs with geo and other info --- README.md | 47 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 
379ac0d..779e8d4 100644 --- a/README.md +++ b/README.md @@ -6,17 +6,46 @@ phoenix_pipeline Turning news into events since 2014. -This system links a series of Python programs to convert the files which have been -downloaded by a [web scraper](https://github.com/openeventdata/scraper) to coded event data which is uploaded to a web site -designated in the config file. The system processes a single day of information, but this -can be derived from multiple text files. The pipeline also implements a filter for -source URLs as defined by the keys in the `source_keys.txt` file. These keys -correspond to the `source` field in the MongoDB instance. +End-to-end system for creating event data from news text. +This system links a series of Python programs to convert the files which have +been downloaded by a [web scraper](https://github.com/openeventdata/scraper) to +coded event data which is uploaded to a web site designated in the config file. +The system processes a single day of information, but this can be derived from +multiple text files. The pipeline also implements a filter for source URLs as +defined by the keys in the `source_keys.txt` file. These keys correspond to the +`source` field in the MongoDB instance. -For more information please visit the [documentation](http://phoenix-pipeline.readthedocs.org/en/latest/). +For more information please visit the +[documentation](http://phoenix-pipeline.readthedocs.org/en/latest/). -##Running +## Requirements + +The pipeline requires either +[Petrarch](https://github.com/openeventdata/petrarch) or +[Petrarch2](https://github.com/openeventdata/petrarch2) to be installed. Both +are Python programs and can be installed from Github using pip. +Other Python dependencies can be installed with `pip install -r requirements.txt`. + +The pipeline assumes that stories are stored in a MongoDB in a particular +format. This format is the one used by the OEDA news RSS scraper. 
See [the +scraper code](https://github.com/openeventdata/scraper/blob/master/mongo_connection.py) +for details on how it structures stories in the Mongo. Using this pipeline with +differently formatted databases will require changing field names throughout +the pipeline code. + +The pipeline requires one of two geocoding systems to be running: CLIFF-CLAVIN +or Mordecai. For CLIFF, see a VM version +[here](https://github.com/ahalterman/CLIFF-up) or the Docker version +[here](https://github.com/caerusassociates/cliff_container). For Mordecai, see +the setup instructions on the Mordecai +[repo](https://github.com/openeventdata/mordecai). The existing Phoenix data +uses CLIFF, but future development of the pipeline will use Mordecai. Set which +geolocation system you're using and its host/port in `PHOX_config.ini`. + +## Running To run the program: - python pipeline.py + `python pipeline.py` + + From 998762ac109f9481ba7637fd65b95ad5f401599c Mon Sep 17 00:00:00 2001 From: Andy Halterman Date: Tue, 5 Jul 2016 11:48:31 -0500 Subject: [PATCH 20/24] Update docs for geolocation and requirements --- README.md | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 779e8d4..d6b138f 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,6 @@ phoenix_pipeline Turning news into events since 2014. -End-to-end system for creating event data from news text. This system links a series of Python programs to convert the files which have been downloaded by a [web scraper](https://github.com/openeventdata/scraper) to coded event data which is uploaded to a web site designated in the config file. @@ -15,8 +14,7 @@ multiple text files. The pipeline also implements a filter for source URLs as defined by the keys in the `source_keys.txt` file. These keys correspond to the `source` field in the MongoDB instance. -For more information please visit the -[documentation](http://phoenix-pipeline.readthedocs.org/en/latest/).
+For more information please visit the [documentation](http://phoenix-pipeline.readthedocs.org/en/latest/). ## Requirements @@ -24,28 +22,27 @@ The pipeline requires either [Petrarch](https://github.com/openeventdata/petrarch) or [Petrarch2](https://github.com/openeventdata/petrarch2) to be installed. Both are Python programs and can be installed from Github using pip. -Other Python dependencies can be installed with `pip install -r requirements.txt`. The pipeline assumes that stories are stored in a MongoDB in a particular format. This format is the one used by the OEDA news RSS scraper. See [the -scraper code](https://github.com/openeventdata/scraper/blob/master/mongo_connection.py) +code](https://github.com/openeventdata/scraper/blob/master/mongo_connection.py) for details on how it structures stories in the Mongo. Using this pipeline with differently formatted databases will require changing field names throughout -the pipeline code. +the code. The pipeline also requires that stories have been parsed with +Stanford CoreNLP. See the [simple and +stable](https://github.com/openeventdata/stanford_pipeline) way to do this, or +the [experimental distributed](https://github.com/oudalab/biryani) approach. The pipeline requires one of two geocoding systems to be running: CLIFF-CLAVIN or Mordecai. For CLIFF, see a VM version -[here](https://github.com/ahalterman/CLIFF-up) or the Docker version -[here](https://github.com/caerusassociates/cliff_container). For Mordecai, see -the setup instructions on the Mordecai -[repo](https://github.com/openeventdata/mordecai). The existing Phoenix data -uses CLIFF, but future development of the pipeline will use Mordecai. Set which -geolocation system you're using and its host/port in `PHOX_config.ini`. +[here](https://github.com/ahalterman/CLIFF-up) or a Docker container version +[here](https://github.com/openeventdata/cliff_container). For Mordecai, see the +setup instructions [here](https://github.com/openeventdata/mordecai).
The +version of the pipeline deployed in production currently uses CLIFF/CLAVIN, but +future development will focus on improvements to Mordecai. -## Running +##Running To run the program: - `python pipeline.py` - - +`python pipeline.py` From 37aa16718bfc09b7043037611016e056e7c2210a Mon Sep 17 00:00:00 2001 From: Andy Halterman Date: Tue, 5 Jul 2016 13:11:43 -0500 Subject: [PATCH 21/24] Add geolocation unit test --- tests/test_geolocation.py | 73 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 tests/test_geolocation.py diff --git a/tests/test_geolocation.py b/tests/test_geolocation.py new file mode 100644 index 0000000..7df7232 --- /dev/null +++ b/tests/test_geolocation.py @@ -0,0 +1,73 @@ +from bson.objectid import ObjectId +import datetime +import sys +import os +sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/../") +import geolocation +import utilities + + +formatted = [{u'language': u'english', +u'title': u'6 killed in attacks in Iraqi capital Friday', +u'url': u'http://www.menafn.com/1094827896/6-killed-in-attacks-in-Iraqi-capital-Friday?src=RSS', +u'stanford': 1, +u'content': u'BAGHDAD: At least six people, including a soldier, were killed in a spate of attacks across Iraqi capital Baghdad on Friday. A sniper opened fire on soldiers manning a checkpoint in southern Baghdad, killing a soldier and injuring three others, police officer Nader al-Janabi told Anadolu Agency. Two civilians were killed and six others injured in a bomb blast in al-Zafarana district in south-eastern Baghdad, he said. Three more civilians were killed and seven others injured in two bomb blasts in southern and northern Baghdad, according to al-Janabi. Iraqi officials often blame the attacks on the Daesh terrorist group, which overran vast swathes of territory in Iraq in 2014. 
', +u'source': u'menafn_iraq', +u'parsed_sents': [u'(ROOT (S (NP (NNP BAGHDAD)) (: :) (NP (NP (QP (IN At) (JJS least) (CD six)) (NNS people)) (, ,) (PP (VBG including) (NP (DT a) (NN soldier))) (, ,)) (VP (VBD were) (VP (VBN killed) (PP (IN in) (NP (NP (DT a) (NN spate)) (PP (IN of) (NP (NNS attacks))))) (PP (IN across) (NP (JJ Iraqi) (NN capital) (NNP Baghdad))) (PP (IN on) (NP (NNP Friday))))) (. .)))', +u'(ROOT (S (NP (DT A) (NN sniper)) (VP (VBD opened) (NP (NN fire)) (PP (IN on) (S (S (NP (NNS soldiers)) (VP (VBG manning) (NP (NP (DT a) (NN checkpoint)) (PP (IN in) (NP (JJ southern) (NNP Baghdad)))) (, ,) (S (VP (VP (VBG killing) (NP (DT a) (NN soldier))) (CC and) (VP (VBG injuring) (NP (CD three) (NNS others))))))) (, ,) (NP (NNS police) (NN officer) (NNP Nader) (NNP al-Janabi)) (VP (VBD told) (NP (NNP Anadolu) (NNP Agency))) (. .))))))', +u'(ROOT (S (S (NP (CD Two) (NNS civilians)) (VP (VBD were) (VP (VP (VBN killed)) (CC and) (NP (NP (CD six) (NNS others)) (VP (VBN injured) (PP (IN in) (NP (NP (DT a) (NN bomb) (NN blast)) (PP (IN in) (NP (NP (NN al-Zafarana) (NN district)) (PP (IN in) (NP (JJ south-eastern) (NNP Baghdad)))))))))))) (, ,) (NP (PRP he)) (VP (VBD said)) (. .)))', +u'(ROOT (S (NP (CD Three) (JJR more) (NNS civilians)) (VP (VBD were) (VP (VP (VBN killed)) (CC and) (NP (NP (CD seven) (NNS others)) (VP (VBN injured) (PP (IN in) (NP (NP (CD two) (NN bomb) (NNS blasts)) (PP (IN in) (NP (ADJP (JJ southern) (CC and) (JJ northern)) (NNP Baghdad))))))) (, ,) (PP (VBG according) (PP (TO to) (NP (NNP al-Janabi)))))) (. .)))', +u'(ROOT (S (NP (JJ Iraqi) (NNS officials)) (ADVP (RB often)) (VP (VBP blame) (NP (NP (DT the) (NNS attacks)) (PP (IN on) (NP (NP (DT the) (NNP Daesh) (JJ terrorist) (NN group)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (VBD overran) (NP (NP (JJ vast) (NNS swathes)) (PP (IN of) (NP (NP (NN territory)) (PP (IN in) (NP (NNP Iraq)))))) (PP (IN in) (NP (CD 2014)))))))))) (. 
.)))'], +u'date': u'160626', +u'date_added': datetime.datetime(2016, 6, 26, 19, 0, 17, 640000), +u'_id': ObjectId('57702641172ab87eb7dc98fa')}, +{u'language': u'english', +u'title': u'Soldiers, Policemen Fight Over Rice', +u'url': u'http://www.thetidenewsonline.com/2016/06/24/soldiers-policemen-fight-over-rice/', +u'stanford': 1, +u'content': u'There was chaos at the Borno State Government House in Maiduguri, yesterday, as soldiers and policemen engaged in gun battle over rice meant for internally displaced persons. The Government House is besieged daily by thousands of internally displaced persons within Maiduguri metropolis, who choose to stay outside of the designated camps. The IDPs, who queue for hours to receive rice and other relief items, often cause gridlock around the Government House with many of them having to go back empty handed each day. The situation, however, turned violent, yesterday afternoon when the soldiers that were deployed to maintain law and order tried to benefit from the largese. The soldiers were said to have tried to force their way into the Deputy Governor\u2019s office, the place designated for the distribution, to get their vehicles filled. An attempt by the mobile policemen attached to the office to prevent the soldiers from achieving their goal led to a shootout. It was gathered that the soldiers fired several warning shots and the mobile policemen shot back in return, while also firing canisters of tear gas. Lucky Irabor, to get the furious soldiers to withdraw from the battle, which caused panic across Maiduguri. 
It was gathered that Irabor, the most senior military officer around, and the Commissioner of Police, Aminchi Baraya, subsequently visited the injured policeman at the hospital.', +u'source': u'nigeria_tidenews', +u'parsed_sents': [u'(ROOT (S (NP (EX There)) (VP (VBD was) (NP (NP (NN chaos)) (PP (IN at) (NP (NP (DT the) (NNP Borno) (NNP State) (NNP Government) (NNP House)) (PP (IN in) (NP (NNP Maiduguri))))) (, ,) (NP (NN yesterday)) (, ,)) (PP (IN as) (NP (NP (NNS soldiers) (CC and) (NNS policemen)) (VP (VBN engaged) (PP (IN in) (NP (NP (NN gun) (NN battle)) (PP (IN over) (NP (NP (NN rice)) (VP (VBN meant) (PP (IN for) (NP (ADJP (RB internally) (JJ displaced)) (NNS persons)))))))))))) (. .)))', +u'(ROOT (S (NP (DT The) (NNP Government) (NNP House)) (VP (VBZ is) (VP (VBN besieged) (ADVP (RB daily)) (PP (IN by) (NP (NP (NNS thousands)) (PP (IN of) (NP (NP (ADJP (RB internally) (JJ displaced)) (NNS persons)) (PP (IN within) (NP (NP (NNP Maiduguri) (NN metropolis)) (, ,) (SBAR (WHNP (WP who)) (S (VP (VBP choose) (S (VP (TO to) (VP (VB stay) (ADVP (IN outside) (PP (IN of) (NP (DT the) (VBN designated) (NNS camps)))))))))))))))))) (. .)))', +u'(ROOT (NP (NP (NP (DT The) (NNS IDPs)) (, ,) (SBAR (WHNP (WP who)) (S (VP (VB queue) (SBAR (IN for) (S (NP (NNS hours)) (VP (TO to) (VP (VB receive) (NP (NP (NN rice) (CC and) (JJ other) (NN relief) (NNS items)) (, ,) (S (ADVP (RB often)) (VP (VBP cause) (NP (NN gridlock)) (PP (IN around) (S (NP (NP (DT the) (NNP Government) (NNP House)) (PP (IN with) (NP (NP (JJ many)) (PP (IN of) (NP (PRP them)))))) (VP (VBG having) (S (VP (TO to) (VP (VB go) (NP (ADJP (RB back) (JJ empty)) (NN handed)) (NP (DT each) (NN day))))))))))))))))))) (. 
.)))', +u'(ROOT (S (S (NP (DT The) (NN situation)) (, ,) (ADVP (RB however)) (, ,) (VP (VBD turned) (ADJP (JJ violent)))) (, ,) (NP (NP (NN yesterday) (NN afternoon)) (SBAR (WHADVP (WRB when)) (S (NP (NP (DT the) (NNS soldiers)) (SBAR (WHNP (WDT that)) (S (VP (VBD were) (VP (VBN deployed) (S (VP (TO to) (VP (VP (VB maintain) (NP (NN law))) (CC and) (VP (NN order) (VP (VBD tried) (S (VP (TO to) (VP (VB benefit) (PP (IN from) (NP (DT the) (NN largese))))))))))))))))))) (. .)))', +u"(ROOT (S (NP (DT The) (NNS soldiers)) (VP (VBD were) (VP (VBN said) (S (VP (TO to) (VP (VB have) (VP (VBN tried) (S (VP (TO to) (VP (VB force) (NP (PRP$ their) (NN way)) (PP (IN into) (NP (NP (NP (DT the) (NNP Deputy) (NNP Governor) (POS 's)) (NN office)) (, ,) (NP (NP (DT the) (NN place)) (VP (VBN designated) (PP (IN for) (NP (DT the) (NN distribution))))) (, ,)))))) (S (VP (TO to) (VP (VB get) (S (NP (PRP$ their) (NNS vehicles)) (VP (VBN filled)))))))))))) (. .)))", +u'(ROOT (S (NP (NP (DT An) (NN attempt)) (PP (IN by) (NP (NP (DT the) (JJ mobile) (NNS policemen)) (VP (VBN attached) (PP (TO to) (NP (DT the) (NN office))) (S (VP (TO to) (VP (VB prevent) (NP (DT the) (NNS soldiers)) (PP (IN from) (S (VP (VBG achieving) (NP (PRP$ their) (NN goal)))))))))))) (VP (VBD led) (PP (TO to) (NP (DT a) (NN shootout)))) (. .)))', +u'(ROOT (S (NP (PRP It)) (VP (VBD was) (VP (VBN gathered) (SBAR (IN that) (S (NP (DT the) (NNS soldiers)) (VP (VBD fired) (SBAR (S (NP (NP (JJ several) (VBG warning) (NNS shots)) (CC and) (NP (DT the) (JJ mobile) (NNS policemen))) (VP (VBD shot) (ADVP (RB back)) (PP (IN in) (NP (NN return))) (, ,) (SBAR (IN while) (S (ADVP (RB also)) (VP (NN firing) (NP (NP (NNS canisters)) (PP (IN of) (S (VP (VB tear) (NP (NN gas))))))))))))))))) (. 
.)))'], +u'date': u'160624', +u'date_added': datetime.datetime(2016, 6, 26, 19, 0, 18), +u'_id': ObjectId('57702642172ab87eb5dc98e9')}, +{ "_id" : ObjectId("57702678172ab87ec2dc9933"), +"content" : "BAGHDAD - A senior Iraqi commander said the city of Fallujah was \"fully liberated\" from Islamic State of Iraq and Syria (ISIS) militants on Sunday, after a more than monthlong military operation. Iraqi troops have entered the northwestern al-Julan neighborhood, the last area of Fallujah to remain under ISIS control, the head of the counterterrorism forces in the operation, Lt. Gen. Abdul-Wahab al-Saadi, told The Associated Press. Al-Saadi said the operation, which began in late May, \"is done and the city is fully liberated.\" The Iraqi army was backed by U.S.-led airstrikes and paramilitary troops, mostly Shiite militias. \"From the center of al-Julan neighborhood, we congratulate the Iraqi people and the commander in chief...and declare that the Fallujah fight is over,\" he told Iraqi state TV, flanked by military officers and soldiers. Some of the soldiers were shooting in the air, chanting and waving the Iraqi flag. He added that troops will start working on removing bombs from the city's streets and buildings. In a statement, the U.S. central military command overseeing the U.S.-led coalition in Iraq said: \"The Coalition continues to provide support through strikes, intelligence, and advice and assistance to the Iraqi Security Forces operating in Fallujah and will continue to do so through deliberate clearing operations.\" Prime Minister Haider al-Abadi declared victory in Fallujah over a week ago, after Iraqi forces advanced into the city center and took control of a government complex. He pledged that remaining pockets of ISIS fighters would be cleared out within hours, but fierce clashes on the city's northern and western edges persisted for days. 
Tens of thousands of people have fled the fighting, overwhelming camps for the displaced run by the government and aid groups. According to the U.N. refugee agency, more than 85,000 people have fled Fallujah and the surrounding area since the offensive began. The UNHCR and others have warned of dire conditions in the camps -- where temperatures are well over 40 degrees Celsius (104 F) and shelter is limited -- and have called for more funds to meet mounting needs. Fallujah, which is located about 40 miles west of Baghdad, was the first city to fall to IS, in January 2014. Fallujah was also a stronghold of Sunni insurgents following the U.S.-led invasion in 2003. More than 100 American soldiers died and hundreds more were wounded in intense, house-by-house fighting in Fallujah in 2004. ISIS extremists still control significant areas in northern and western Iraq, including the country's second-largest city, Mosul. The group declared an Islamic caliphate on the territory it holds in Iraq and Syria and at the height of its power was estimated to hold nearly a third of each country. More than 3.3 million Iraqis have fled their homes since ISIS swept across northern and western Iraq in the summer of 2014, according to U.N. figures. 
More than 40 percent of the displaced are from Anbar province, where Fallujah is located.", +"source" : "cbs_world", +"date" : "Sun, 26 Jun 2016 17:37:27 -0400", +"language" : "english", +"title" : "Iraq: Fallujah \"fully liberated\" after monthlong fight", +"url" : "http://www.cbsnews.com/news/iraqi-commander-fallujah-fully-liberated-after-a-month/", +"date_added" : datetime.datetime(2016, 6, 26, 19, 0, 18), +"stanford" : 1, +"parsed_sents" : [ "(ROOT (S (NP (NNP BAGHDAD) (: -) (NN A) (JJ senior) (JJ Iraqi) (NN commander)) (VP (VBD said) (SBAR (S (NP (NP (DT the) (NN city)) (PP (IN of) (NP (NNP Fallujah)))) (VP (VBD was) (`` ``) (VP (ADVP (RB fully)) (VBN liberated) ('' '') (PP (IN from) (NP (NP (JJ Islamic) (NN State) (PP (IN of) (NP (NP (NNP Iraq)) (CC and) (NP (NNP Syria) (PRN (-LRB- -LRB-) (NNP ISIS) (-RRB- -RRB-)) (NNS militants))))) (PP (IN on) (NP (NNP Sunday)))))) (, ,) (PP (IN after) (NP (DT a) (ADVP (JJR more) (IN than)) (JJ monthlong) (JJ military) (NN operation))))))) (. .)))", +"(ROOT (S (S (NP (JJ Iraqi) (NNS troops)) (VP (VBP have) (VP (VBN entered) (NP (DT the) (JJ northwestern) (JJ al-Julan) (NN neighborhood))))) (, ,) (NP (NP (NP (DT the) (JJ last) (NN area)) (PP (IN of) (NP (NNP Fallujah))) (S (VP (TO to) (VP (VB remain) (PP (IN under) (NP (NNP ISIS) (NN control))))))) (, ,) (NP (NP (DT the) (NN head)) (PP (IN of) (NP (NP (DT the) (NN counterterrorism) (NNS forces)) (PP (IN in) (NP (DT the) (NN operation)))))) (, ,) (NP (NNP Lt.) (NNP Gen.) (NNP Abdul-Wahab) (NNP al-Saadi)) (, ,)) (VP (VBD told) (NP (DT The) (NNP Associated) (NNP Press))) (. .)))", +"(ROOT (S (NP (NNP Al-Saadi)) (VP (VBD said) (SBAR (S (S (NP (NP (DT the) (NN operation)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (VBD began) (PP (IN in) (NP (JJ late) (NNP May)))))) (, ,)) (`` ``) (VP (VBZ is) (VP (VBN done)))) (CC and) (S (NP (DT the) (NN city)) (VP (VBZ is) (ADVP (RB fully)) (VP (VBN liberated))))))) (. .) 
('' '')))", +"(ROOT (S (`` ``) (S (PP (IN From) (NP (NP (DT the) (NN center)) (PP (IN of) (NP (JJ al-Julan) (NN neighborhood))))) (, ,) (NP (PRP we)) (VP (VP (VBP congratulate) (NP (NP (DT the) (JJ Iraqi) (NNS people)) (CC and) (NP (NP (DT the) (NN commander)) (PP (IN in) (NP (NN chief)))))) (: ...) (CC and) (VP (VB declare) (SBAR (IN that) (S (NP (DT the) (NNP Fallujah) (NN fight)) (VP (VBZ is) (ADVP (IN over)))))))) (, ,) ('' '') (NP (PRP he)) (VP (VBD told) (NP (JJ Iraqi) (NN state) (NN TV)) (, ,) (S (VP (VBN flanked) (PP (IN by) (NP (JJ military) (NNS officers) (CC and) (NNS soldiers)))))) (. .)))", +"(ROOT (S (PP (IN In) (NP (DT a) (NN statement))) (, ,) (NP (NP (DT the) (NNP U.S.) (JJ central) (JJ military) (NN command)) (VP (VBG overseeing) (NP (NP (DT the) (JJ U.S.-led) (NN coalition)) (PP (IN in) (NP (NNP Iraq)))))) (VP (VBD said) (: :) (`` ``) (S (NP (DT The) (NNP Coalition)) (VP (VP (VBZ continues) (S (VP (TO to) (VP (VB provide) (NP (NN support)) (PP (IN through) (NP (NP (NP (NNS strikes)) (, ,) (NP (NN intelligence)) (, ,) (CC and) (NP (NN advice))) (CC and) (NP (NP (NN assistance)) (PP (TO to) (NP (NP (DT the) (JJ Iraqi) (NN Security) (NNS Forces)) (VP (VBG operating) (PP (IN in) (NP (NNP Fallujah))))))))))))) (CC and) (VP (MD will) (VP (VB continue) (S (VP (TO to) (VP (VB do) (ADVP (RB so))))) (PP (IN through) (NP (JJ deliberate) (NN clearing) (NNS operations)))))))) (. .) ('' '')))", +"(ROOT (S (NP (PRP He)) (VP (VP (VBD pledged) (SBAR (IN that) (S (NP (NP (VBG remaining) (NNS pockets)) (PP (IN of) (NP (NNP ISIS) (NNS fighters)))) (VP (MD would) (VP (VB be) (VP (VBN cleared) (PRT (RP out)) (PP (IN within) (NP (NNS hours))))))))) (, ,) (CC but) (S (NP (NP (JJ fierce) (NNS clashes)) (PP (IN on) (NP (NP (DT the) (NN city) (POS 's)) (ADJP (JJ northern) (CC and) (JJ western)) (NNS edges)))) (VP (VBD persisted) (PP (IN for) (NP (NNS days)))))) (. 
.)))", +"(ROOT (S (NP (NP (NNS Tens)) (PP (IN of) (NP (NP (NNS thousands)) (PP (IN of) (NP (NNS people)))))) (VP (VBP have) (VP (VBN fled) (NP (NP (DT the) (NN fighting)) (, ,) (NP (NP (JJ overwhelming) (NNS camps)) (PP (IN for) (NP (DT the) (JJ displaced) (NN run)))) (PP (IN by) (NP (DT the) (NN government) (CC and) (NN aid) (NNS groups)))))) (. .)))" ] }] + + + +def test_query_mordecai(): + server_details, geo_details, file_details, petrarch_version = utilities.parse_config('../PHOX_config.ini') + mord_results = geolocation.query_mordecai(formatted[0]['content'], geo_details[3], geo_details[4]) + correct_mord_results = [{u'lat': 33.23333, u'searchterm': u'BAGHDAD', + u'lon': 44.23333, u'countrycode': u'IRQ', u'placename': u'Baghdad'}, {u'lat': + 33.34058, u'searchterm': u'Baghdad', u'lon': 44.40088, u'countrycode': u'IRQ', + u'placename': u'Baghdad'}, {u'lat': 33.23333, u'searchterm': u'Baghdad', + u'lon': 44.23333, u'countrycode': u'IRQ', u'placename': u'Baghdad'}, {u'lat': + 33.97683, u'searchterm': u'al-Zafarana', u'lon': 44.8901, u'countrycode': + u'IRQ', u'placename': u'Al Imam al Miqdad'}, {u'lat': 33.23333, u'searchterm': + u'Baghdad', u'lon': 44.23333, u'countrycode': u'IRQ', u'placename': + u'Baghdad'}, {u'lat': 33.23333, u'searchterm': u'Baghdad', u'lon': 44.23333, + u'countrycode': u'IRQ', u'placename': u'Baghdad'}] + assert mord_results == correct_mord_results + From 38109c158471fe0f1347fb1c11057767c3886110 Mon Sep 17 00:00:00 2001 From: Andy Halterman Date: Wed, 6 Jul 2016 17:05:37 -0500 Subject: [PATCH 22/24] Change test to not require running Mord service --- tests/test_geolocation.py | 66 ++------------------------------------- 1 file changed, 3 insertions(+), 63 deletions(-) diff --git a/tests/test_geolocation.py b/tests/test_geolocation.py index 7df7232..35525d3 100644 --- a/tests/test_geolocation.py +++ b/tests/test_geolocation.py @@ -6,68 +6,8 @@ import geolocation import utilities - -formatted = [{u'language': u'english', -u'title': u'6 
killed in attacks in Iraqi capital Friday', -u'url': u'http://www.menafn.com/1094827896/6-killed-in-attacks-in-Iraqi-capital-Friday?src=RSS', -u'stanford': 1, -u'content': u'BAGHDAD: At least six people, including a soldier, were killed in a spate of attacks across Iraqi capital Baghdad on Friday. A sniper opened fire on soldiers manning a checkpoint in southern Baghdad, killing a soldier and injuring three others, police officer Nader al-Janabi told Anadolu Agency. Two civilians were killed and six others injured in a bomb blast in al-Zafarana district in south-eastern Baghdad, he said. Three more civilians were killed and seven others injured in two bomb blasts in southern and northern Baghdad, according to al-Janabi. Iraqi officials often blame the attacks on the Daesh terrorist group, which overran vast swathes of territory in Iraq in 2014. ', -u'source': u'menafn_iraq', -u'parsed_sents': [u'(ROOT (S (NP (NNP BAGHDAD)) (: :) (NP (NP (QP (IN At) (JJS least) (CD six)) (NNS people)) (, ,) (PP (VBG including) (NP (DT a) (NN soldier))) (, ,)) (VP (VBD were) (VP (VBN killed) (PP (IN in) (NP (NP (DT a) (NN spate)) (PP (IN of) (NP (NNS attacks))))) (PP (IN across) (NP (JJ Iraqi) (NN capital) (NNP Baghdad))) (PP (IN on) (NP (NNP Friday))))) (. .)))', -u'(ROOT (S (NP (DT A) (NN sniper)) (VP (VBD opened) (NP (NN fire)) (PP (IN on) (S (S (NP (NNS soldiers)) (VP (VBG manning) (NP (NP (DT a) (NN checkpoint)) (PP (IN in) (NP (JJ southern) (NNP Baghdad)))) (, ,) (S (VP (VP (VBG killing) (NP (DT a) (NN soldier))) (CC and) (VP (VBG injuring) (NP (CD three) (NNS others))))))) (, ,) (NP (NNS police) (NN officer) (NNP Nader) (NNP al-Janabi)) (VP (VBD told) (NP (NNP Anadolu) (NNP Agency))) (. 
.))))))', -u'(ROOT (S (S (NP (CD Two) (NNS civilians)) (VP (VBD were) (VP (VP (VBN killed)) (CC and) (NP (NP (CD six) (NNS others)) (VP (VBN injured) (PP (IN in) (NP (NP (DT a) (NN bomb) (NN blast)) (PP (IN in) (NP (NP (NN al-Zafarana) (NN district)) (PP (IN in) (NP (JJ south-eastern) (NNP Baghdad)))))))))))) (, ,) (NP (PRP he)) (VP (VBD said)) (. .)))', -u'(ROOT (S (NP (CD Three) (JJR more) (NNS civilians)) (VP (VBD were) (VP (VP (VBN killed)) (CC and) (NP (NP (CD seven) (NNS others)) (VP (VBN injured) (PP (IN in) (NP (NP (CD two) (NN bomb) (NNS blasts)) (PP (IN in) (NP (ADJP (JJ southern) (CC and) (JJ northern)) (NNP Baghdad))))))) (, ,) (PP (VBG according) (PP (TO to) (NP (NNP al-Janabi)))))) (. .)))', -u'(ROOT (S (NP (JJ Iraqi) (NNS officials)) (ADVP (RB often)) (VP (VBP blame) (NP (NP (DT the) (NNS attacks)) (PP (IN on) (NP (NP (DT the) (NNP Daesh) (JJ terrorist) (NN group)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (VBD overran) (NP (NP (JJ vast) (NNS swathes)) (PP (IN of) (NP (NP (NN territory)) (PP (IN in) (NP (NNP Iraq)))))) (PP (IN in) (NP (CD 2014)))))))))) (. .)))'], -u'date': u'160626', -u'date_added': datetime.datetime(2016, 6, 26, 19, 0, 17, 640000), -u'_id': ObjectId('57702641172ab87eb7dc98fa')}, -{u'language': u'english', -u'title': u'Soldiers, Policemen Fight Over Rice', -u'url': u'http://www.thetidenewsonline.com/2016/06/24/soldiers-policemen-fight-over-rice/', -u'stanford': 1, -u'content': u'There was chaos at the Borno State Government House in Maiduguri, yesterday, as soldiers and policemen engaged in gun battle over rice meant for internally displaced persons. The Government House is besieged daily by thousands of internally displaced persons within Maiduguri metropolis, who choose to stay outside of the designated camps. The IDPs, who queue for hours to receive rice and other relief items, often cause gridlock around the Government House with many of them having to go back empty handed each day. 
The situation, however, turned violent, yesterday afternoon when the soldiers that were deployed to maintain law and order tried to benefit from the largese. The soldiers were said to have tried to force their way into the Deputy Governor\u2019s office, the place designated for the distribution, to get their vehicles filled. An attempt by the mobile policemen attached to the office to prevent the soldiers from achieving their goal led to a shootout. It was gathered that the soldiers fired several warning shots and the mobile policemen shot back in return, while also firing canisters of tear gas. Lucky Irabor, to get the furious soldiers to withdraw from the battle, which caused panic across Maiduguri. It was gathered that Irabor, the most senior military officer around, and the Commissioner of Police, Aminchi Baraya, subsequently visited the injured policeman at the hospital.', -u'source': u'nigeria_tidenews', -u'parsed_sents': [u'(ROOT (S (NP (EX There)) (VP (VBD was) (NP (NP (NN chaos)) (PP (IN at) (NP (NP (DT the) (NNP Borno) (NNP State) (NNP Government) (NNP House)) (PP (IN in) (NP (NNP Maiduguri))))) (, ,) (NP (NN yesterday)) (, ,)) (PP (IN as) (NP (NP (NNS soldiers) (CC and) (NNS policemen)) (VP (VBN engaged) (PP (IN in) (NP (NP (NN gun) (NN battle)) (PP (IN over) (NP (NP (NN rice)) (VP (VBN meant) (PP (IN for) (NP (ADJP (RB internally) (JJ displaced)) (NNS persons)))))))))))) (. .)))', -u'(ROOT (S (NP (DT The) (NNP Government) (NNP House)) (VP (VBZ is) (VP (VBN besieged) (ADVP (RB daily)) (PP (IN by) (NP (NP (NNS thousands)) (PP (IN of) (NP (NP (ADJP (RB internally) (JJ displaced)) (NNS persons)) (PP (IN within) (NP (NP (NNP Maiduguri) (NN metropolis)) (, ,) (SBAR (WHNP (WP who)) (S (VP (VBP choose) (S (VP (TO to) (VP (VB stay) (ADVP (IN outside) (PP (IN of) (NP (DT the) (VBN designated) (NNS camps)))))))))))))))))) (. 
.)))', -u'(ROOT (NP (NP (NP (DT The) (NNS IDPs)) (, ,) (SBAR (WHNP (WP who)) (S (VP (VB queue) (SBAR (IN for) (S (NP (NNS hours)) (VP (TO to) (VP (VB receive) (NP (NP (NN rice) (CC and) (JJ other) (NN relief) (NNS items)) (, ,) (S (ADVP (RB often)) (VP (VBP cause) (NP (NN gridlock)) (PP (IN around) (S (NP (NP (DT the) (NNP Government) (NNP House)) (PP (IN with) (NP (NP (JJ many)) (PP (IN of) (NP (PRP them)))))) (VP (VBG having) (S (VP (TO to) (VP (VB go) (NP (ADJP (RB back) (JJ empty)) (NN handed)) (NP (DT each) (NN day))))))))))))))))))) (. .)))', -u'(ROOT (S (S (NP (DT The) (NN situation)) (, ,) (ADVP (RB however)) (, ,) (VP (VBD turned) (ADJP (JJ violent)))) (, ,) (NP (NP (NN yesterday) (NN afternoon)) (SBAR (WHADVP (WRB when)) (S (NP (NP (DT the) (NNS soldiers)) (SBAR (WHNP (WDT that)) (S (VP (VBD were) (VP (VBN deployed) (S (VP (TO to) (VP (VP (VB maintain) (NP (NN law))) (CC and) (VP (NN order) (VP (VBD tried) (S (VP (TO to) (VP (VB benefit) (PP (IN from) (NP (DT the) (NN largese))))))))))))))))))) (. .)))', -u"(ROOT (S (NP (DT The) (NNS soldiers)) (VP (VBD were) (VP (VBN said) (S (VP (TO to) (VP (VB have) (VP (VBN tried) (S (VP (TO to) (VP (VB force) (NP (PRP$ their) (NN way)) (PP (IN into) (NP (NP (NP (DT the) (NNP Deputy) (NNP Governor) (POS 's)) (NN office)) (, ,) (NP (NP (DT the) (NN place)) (VP (VBN designated) (PP (IN for) (NP (DT the) (NN distribution))))) (, ,)))))) (S (VP (TO to) (VP (VB get) (S (NP (PRP$ their) (NNS vehicles)) (VP (VBN filled)))))))))))) (. .)))", -u'(ROOT (S (NP (NP (DT An) (NN attempt)) (PP (IN by) (NP (NP (DT the) (JJ mobile) (NNS policemen)) (VP (VBN attached) (PP (TO to) (NP (DT the) (NN office))) (S (VP (TO to) (VP (VB prevent) (NP (DT the) (NNS soldiers)) (PP (IN from) (S (VP (VBG achieving) (NP (PRP$ their) (NN goal)))))))))))) (VP (VBD led) (PP (TO to) (NP (DT a) (NN shootout)))) (. 
.)))', -u'(ROOT (S (NP (PRP It)) (VP (VBD was) (VP (VBN gathered) (SBAR (IN that) (S (NP (DT the) (NNS soldiers)) (VP (VBD fired) (SBAR (S (NP (NP (JJ several) (VBG warning) (NNS shots)) (CC and) (NP (DT the) (JJ mobile) (NNS policemen))) (VP (VBD shot) (ADVP (RB back)) (PP (IN in) (NP (NN return))) (, ,) (SBAR (IN while) (S (ADVP (RB also)) (VP (NN firing) (NP (NP (NNS canisters)) (PP (IN of) (S (VP (VB tear) (NP (NN gas))))))))))))))))) (. .)))'], -u'date': u'160624', -u'date_added': datetime.datetime(2016, 6, 26, 19, 0, 18), -u'_id': ObjectId('57702642172ab87eb5dc98e9')}, -{ "_id" : ObjectId("57702678172ab87ec2dc9933"), -"content" : "BAGHDAD - A senior Iraqi commander said the city of Fallujah was \"fully liberated\" from Islamic State of Iraq and Syria (ISIS) militants on Sunday, after a more than monthlong military operation. Iraqi troops have entered the northwestern al-Julan neighborhood, the last area of Fallujah to remain under ISIS control, the head of the counterterrorism forces in the operation, Lt. Gen. Abdul-Wahab al-Saadi, told The Associated Press. Al-Saadi said the operation, which began in late May, \"is done and the city is fully liberated.\" The Iraqi army was backed by U.S.-led airstrikes and paramilitary troops, mostly Shiite militias. \"From the center of al-Julan neighborhood, we congratulate the Iraqi people and the commander in chief...and declare that the Fallujah fight is over,\" he told Iraqi state TV, flanked by military officers and soldiers. Some of the soldiers were shooting in the air, chanting and waving the Iraqi flag. He added that troops will start working on removing bombs from the city's streets and buildings. In a statement, the U.S. 
central military command overseeing the U.S.-led coalition in Iraq said: \"The Coalition continues to provide support through strikes, intelligence, and advice and assistance to the Iraqi Security Forces operating in Fallujah and will continue to do so through deliberate clearing operations.\" Prime Minister Haider al-Abadi declared victory in Fallujah over a week ago, after Iraqi forces advanced into the city center and took control of a government complex. He pledged that remaining pockets of ISIS fighters would be cleared out within hours, but fierce clashes on the city's northern and western edges persisted for days. Tens of thousands of people have fled the fighting, overwhelming camps for the displaced run by the government and aid groups. According to the U.N. refugee agency, more than 85,000 people have fled Fallujah and the surrounding area since the offensive began. The UNHCR and others have warned of dire conditions in the camps -- where temperatures are well over 40 degrees Celsius (104 F) and shelter is limited -- and have called for more funds to meet mounting needs. Fallujah, which is located about 40 miles west of Baghdad, was the first city to fall to IS, in January 2014. Fallujah was also a stronghold of Sunni insurgents following the U.S.-led invasion in 2003. More than 100 American soldiers died and hundreds more were wounded in intense, house-by-house fighting in Fallujah in 2004. ISIS extremists still control significant areas in northern and western Iraq, including the country's second-largest city, Mosul. The group declared an Islamic caliphate on the territory it holds in Iraq and Syria and at the height of its power was estimated to hold nearly a third of each country. More than 3.3 million Iraqis have fled their homes since ISIS swept across northern and western Iraq in the summer of 2014, according to U.N. figures. 
More than 40 percent of the displaced are from Anbar province, where Fallujah is located.", -"source" : "cbs_world", -"date" : "Sun, 26 Jun 2016 17:37:27 -0400", -"language" : "english", -"title" : "Iraq: Fallujah \"fully liberated\" after monthlong fight", -"url" : "http://www.cbsnews.com/news/iraqi-commander-fallujah-fully-liberated-after-a-month/", -"date_added" : datetime.datetime(2016, 6, 26, 19, 0, 18), -"stanford" : 1, -"parsed_sents" : [ "(ROOT (S (NP (NNP BAGHDAD) (: -) (NN A) (JJ senior) (JJ Iraqi) (NN commander)) (VP (VBD said) (SBAR (S (NP (NP (DT the) (NN city)) (PP (IN of) (NP (NNP Fallujah)))) (VP (VBD was) (`` ``) (VP (ADVP (RB fully)) (VBN liberated) ('' '') (PP (IN from) (NP (NP (JJ Islamic) (NN State) (PP (IN of) (NP (NP (NNP Iraq)) (CC and) (NP (NNP Syria) (PRN (-LRB- -LRB-) (NNP ISIS) (-RRB- -RRB-)) (NNS militants))))) (PP (IN on) (NP (NNP Sunday)))))) (, ,) (PP (IN after) (NP (DT a) (ADVP (JJR more) (IN than)) (JJ monthlong) (JJ military) (NN operation))))))) (. .)))", -"(ROOT (S (S (NP (JJ Iraqi) (NNS troops)) (VP (VBP have) (VP (VBN entered) (NP (DT the) (JJ northwestern) (JJ al-Julan) (NN neighborhood))))) (, ,) (NP (NP (NP (DT the) (JJ last) (NN area)) (PP (IN of) (NP (NNP Fallujah))) (S (VP (TO to) (VP (VB remain) (PP (IN under) (NP (NNP ISIS) (NN control))))))) (, ,) (NP (NP (DT the) (NN head)) (PP (IN of) (NP (NP (DT the) (NN counterterrorism) (NNS forces)) (PP (IN in) (NP (DT the) (NN operation)))))) (, ,) (NP (NNP Lt.) (NNP Gen.) (NNP Abdul-Wahab) (NNP al-Saadi)) (, ,)) (VP (VBD told) (NP (DT The) (NNP Associated) (NNP Press))) (. .)))", -"(ROOT (S (NP (NNP Al-Saadi)) (VP (VBD said) (SBAR (S (S (NP (NP (DT the) (NN operation)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (VBD began) (PP (IN in) (NP (JJ late) (NNP May)))))) (, ,)) (`` ``) (VP (VBZ is) (VP (VBN done)))) (CC and) (S (NP (DT the) (NN city)) (VP (VBZ is) (ADVP (RB fully)) (VP (VBN liberated))))))) (. .) 
('' '')))", -"(ROOT (S (`` ``) (S (PP (IN From) (NP (NP (DT the) (NN center)) (PP (IN of) (NP (JJ al-Julan) (NN neighborhood))))) (, ,) (NP (PRP we)) (VP (VP (VBP congratulate) (NP (NP (DT the) (JJ Iraqi) (NNS people)) (CC and) (NP (NP (DT the) (NN commander)) (PP (IN in) (NP (NN chief)))))) (: ...) (CC and) (VP (VB declare) (SBAR (IN that) (S (NP (DT the) (NNP Fallujah) (NN fight)) (VP (VBZ is) (ADVP (IN over)))))))) (, ,) ('' '') (NP (PRP he)) (VP (VBD told) (NP (JJ Iraqi) (NN state) (NN TV)) (, ,) (S (VP (VBN flanked) (PP (IN by) (NP (JJ military) (NNS officers) (CC and) (NNS soldiers)))))) (. .)))", -"(ROOT (S (PP (IN In) (NP (DT a) (NN statement))) (, ,) (NP (NP (DT the) (NNP U.S.) (JJ central) (JJ military) (NN command)) (VP (VBG overseeing) (NP (NP (DT the) (JJ U.S.-led) (NN coalition)) (PP (IN in) (NP (NNP Iraq)))))) (VP (VBD said) (: :) (`` ``) (S (NP (DT The) (NNP Coalition)) (VP (VP (VBZ continues) (S (VP (TO to) (VP (VB provide) (NP (NN support)) (PP (IN through) (NP (NP (NP (NNS strikes)) (, ,) (NP (NN intelligence)) (, ,) (CC and) (NP (NN advice))) (CC and) (NP (NP (NN assistance)) (PP (TO to) (NP (NP (DT the) (JJ Iraqi) (NN Security) (NNS Forces)) (VP (VBG operating) (PP (IN in) (NP (NNP Fallujah))))))))))))) (CC and) (VP (MD will) (VP (VB continue) (S (VP (TO to) (VP (VB do) (ADVP (RB so))))) (PP (IN through) (NP (JJ deliberate) (NN clearing) (NNS operations)))))))) (. .) ('' '')))", -"(ROOT (S (NP (PRP He)) (VP (VP (VBD pledged) (SBAR (IN that) (S (NP (NP (VBG remaining) (NNS pockets)) (PP (IN of) (NP (NNP ISIS) (NNS fighters)))) (VP (MD would) (VP (VB be) (VP (VBN cleared) (PRT (RP out)) (PP (IN within) (NP (NNS hours))))))))) (, ,) (CC but) (S (NP (NP (JJ fierce) (NNS clashes)) (PP (IN on) (NP (NP (DT the) (NN city) (POS 's)) (ADJP (JJ northern) (CC and) (JJ western)) (NNS edges)))) (VP (VBD persisted) (PP (IN for) (NP (NNS days)))))) (. 
.)))", -"(ROOT (S (NP (NP (NNS Tens)) (PP (IN of) (NP (NP (NNS thousands)) (PP (IN of) (NP (NNS people)))))) (VP (VBP have) (VP (VBN fled) (NP (NP (DT the) (NN fighting)) (, ,) (NP (NP (JJ overwhelming) (NNS camps)) (PP (IN for) (NP (DT the) (JJ displaced) (NN run)))) (PP (IN by) (NP (DT the) (NN government) (CC and) (NN aid) (NNS groups)))))) (. .)))" ] }] - - - -def test_query_mordecai(): +def test_geo_config(): server_details, geo_details, file_details, petrarch_version = utilities.parse_config('../PHOX_config.ini') - mord_results = geolocation.query_mordecai(formatted[0]['content'], geo_details[3], geo_details[4]) - correct_mord_results = [{u'lat': 33.23333, u'searchterm': u'BAGHDAD', - u'lon': 44.23333, u'countrycode': u'IRQ', u'placename': u'Baghdad'}, {u'lat': - 33.34058, u'searchterm': u'Baghdad', u'lon': 44.40088, u'countrycode': u'IRQ', - u'placename': u'Baghdad'}, {u'lat': 33.23333, u'searchterm': u'Baghdad', - u'lon': 44.23333, u'countrycode': u'IRQ', u'placename': u'Baghdad'}, {u'lat': - 33.97683, u'searchterm': u'al-Zafarana', u'lon': 44.8901, u'countrycode': - u'IRQ', u'placename': u'Al Imam al Miqdad'}, {u'lat': 33.23333, u'searchterm': - u'Baghdad', u'lon': 44.23333, u'countrycode': u'IRQ', u'placename': - u'Baghdad'}, {u'lat': 33.23333, u'searchterm': u'Baghdad', u'lon': 44.23333, - u'countrycode': u'IRQ', u'placename': u'Baghdad'}] - assert mord_results == correct_mord_results + geo_keys = geo_details._asdict().keys() + assert geo_keys == ['geo_service', 'cliff_host', 'cliff_port', 'mordecai_host', 'mordecai_port'] From 41951852b9356fee85b702d2652d948f8980eff6 Mon Sep 17 00:00:00 2001 From: Andy Halterman Date: Wed, 6 Jul 2016 17:15:18 -0500 Subject: [PATCH 23/24] Fix path for Travis/py.test --- tests/test_geolocation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_geolocation.py b/tests/test_geolocation.py index 35525d3..54877d8 100644 --- a/tests/test_geolocation.py +++ b/tests/test_geolocation.py @@ -7,7 +7,7 @@ 
import utilities def test_geo_config(): - server_details, geo_details, file_details, petrarch_version = utilities.parse_config('../PHOX_config.ini') + server_details, geo_details, file_details, petrarch_version = utilities.parse_config('PHOX_config.ini') geo_keys = geo_details._asdict().keys() assert geo_keys == ['geo_service', 'cliff_host', 'cliff_port', 'mordecai_host', 'mordecai_port'] From d1639c59a3592a52706d180f0ea26ef407d9e5b7 Mon Sep 17 00:00:00 2001 From: Andy Halterman Date: Mon, 1 May 2017 10:54:59 -0400 Subject: [PATCH 24/24] More configs (#102) * Add Petr2 config file * Use config file for Mongo details * Add Petr2 config file * Forgot about the scraper conns here --- PHOX_config.ini | 8 +++- README.md | 24 ++++++++-- geolocation.py | 6 ++- petr_config.ini | 106 ++++++++++++++++++++++++++++++++++++++++++ pipeline.py | 9 ++-- scraper_connection.py | 4 +- utilities.py | 23 +++++++-- 7 files changed, 163 insertions(+), 17 deletions(-) create mode 100644 petr_config.ini diff --git a/PHOX_config.ini b/PHOX_config.ini index 26ad5d7..3c38e25 100644 --- a/PHOX_config.ini +++ b/PHOX_config.ini @@ -9,7 +9,7 @@ geo_service = Mordecai cliff_host = http://localhost cliff_port = 8080 mordecai_host = http://localhost -mordecai_port = 5011 +mordecai_port = 5000 [Pipeline] scraper_stem = scraper_results_ @@ -20,11 +20,15 @@ dupfile_stem = Phoenix.dupindex. outputfile_stem = Phoenix.events.20 newsourcestem = newsources. -oneaday_filter = True +oneaday_filter = False [Petrarch] petrarch_version = 2 +[Mongo] +db = event_scrape +collection = stories + #[Logging] #log_file = /root/logs/pipeline.log diff --git a/README.md b/README.md index d6b138f..3d52aeb 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,9 @@ phoenix_pipeline Turning news into events since 2014. 
-This system links a series of Python programs to convert the files which have -been downloaded by a [web scraper](https://github.com/openeventdata/scraper) to + +This system links a series of Python programs to convert the files which have been +downloaded by a [web scraper](https://github.com/openeventdata/scraper) to coded event data which is uploaded to a web site designated in the config file. The system processes a single day of information, but this can be derived from multiple text files. The pipeline also implements a filter for source URLs as @@ -16,6 +17,7 @@ defined by the keys in the `source_keys.txt` file. These keys correspond to the For more information please visit the [documentation](http://phoenix-pipeline.readthedocs.org/en/latest/). + ## Requirements The pipeline requires either @@ -41,8 +43,22 @@ setup instructions [here](https://github.com/openeventdata/mordecai). The version of the pipeline deployed in production currently uses CLIFF/CLAVIN, but future development will focus on improvements to Mordecai. -##Running +## Configuration + +The pipeline has two configuration files. `PHOX_config.ini` specifies which +geolocation system to use, how to name the files produced by the pipeline, and +how to upload the files to a remote server if desired. + +`petr_config.ini` is the configuration file for Petrarch2 itself, including the +location of dictionaries, new actor extraction options, and the one-a-day filter. For +more details see the main [Petrarch2 repo](https://github.com/openeventdata/petrarch2/). + +## Running To run the program: -`python pipeline.py` +``` +python pipeline.py +``` + + diff --git a/geolocation.py b/geolocation.py index 906c701..1e5e94e 100644 --- a/geolocation.py +++ b/geolocation.py @@ -288,7 +288,8 @@ def mordecai(events, file_details, server_details, geo_details): Same as in the parameter but with the addition of a value that is a list of lon, lat, placeName, stateName, countryCode. 
""" - coll = utilities.make_conn(file_details.auth_db, file_details.auth_user, + coll = utilities.make_conn(file_details.db_db, file_details.db_collection, + file_details.auth_db, file_details.auth_user, file_details.auth_pass) for event in events: @@ -329,7 +330,8 @@ def cliff(events, file_details, server_details, geo_details): Same as in the parameter but with the addition of a value that is a list of lon, lat, placeName, stateName, countryCode. """ - coll = utilities.make_conn(file_details.auth_db, file_details.auth_user, + coll = utilities.make_conn(file_details.db_db, file_details.db_collection, + file_details.auth_db, file_details.auth_user, file_details.auth_pass) for event in events: diff --git a/petr_config.ini b/petr_config.ini new file mode 100644 index 0000000..9ad6cd5 --- /dev/null +++ b/petr_config.ini @@ -0,0 +1,106 @@ +# Configuration file for release version of PETRARCH event coder +# Codes the GigaWord.sample.PETR.txt using current dictionaries and default options +# Last update: 30 April 2015 + +[Dictionaries] +# See the PETRreader.py file for the purpose and format of these files +verbfile_name = CAMEO.2.0.txt +actorfile_list = Phoenix.Countries.actors.txt, Phoenix.International.actors.txt, Phoenix.MilNonState.actors.txt +agentfile_name = Phoenix.agents.txt +discardfile_name = Phoenix.discards.txt +issuefile_name = Phoenix.IssueCoding.txt + + + + +[Options] +# textfile_list is a comma-delimited list of text files to code. This list has priority if +# both a textfile_list and textfile_name are present +textfile_list = data/text/GigaWord.sample.PETR.xml +#textfile_list = AFP0808-01.xml, AFP0909-01.xml, AFP1210-01.xml +# textfile_name is the name of a file containing a list of names of files to code, one +# file name per line. 
+#textfile_name = PETR.textfiles.benchmark.txt + +# eventfile_name is the output file for the events +eventfile_name = events.PETR-Demo.txt + + +# INTERFACE OPTIONS: uncomment to activate +# Default: set all of these false, which is equivalent to an A)utocode in TABARI + +# code_by_sentence: show events after each sentence has been coded; default is to +# show events after all of the sentences in a story have been coded +code_by_sentence = True +# pause_by_sentence: pause after the coding of each sentence. Entering 'Return' will +# cause the next sentence to be coded; entering any character will +# cause the program to exit. Default is to code without any pausing. +#pause_by_sentence = True +# pause_by_story: pause after the coding of each story. +#pause_by_story = True + + +# CODING OPTIONS: +# Defaults are more or less equivalent to TABARI + +# write_actor_root: If True, the event record will include the text of the actor root: +# The root is the text at the head of the actor synonym set in the +# dictionary. Default is False +write_actor_root = False + +# write_actor_text: If True, the event record will include the complete text of +# the noun phrase that was used to identify the actor. Default is False +write_actor_text = True + +# write_event_text: If True, the event record will include the complete text of +# the verb phrase that was used to identify the event. Default is False +write_event_text = True + +# NULL CODING OPTIONS +# null_verbs: If True, only get verb phrases that are not in the dictionary but are associated +# with coded noun phrases +null_verbs = False + +# null_actors: If True, only get actor phrases that are not in the dictionary but associated with +# coded verb phrases. This also requires new_actor_length to be set to a value > 0: +# typically a value of 4 to 8 will give good results. 
+null_actors = False + +# new_actor_length: Maximum length for new actors extracted from noun phrases if no +# actor or agent generating a code is found. To disable and just +# use null codes "---", set to zero; this is the default. +# Setting this to a large number will extract anything found in a (NP +# noun phrase, though usually true actors contain a small number of words +# This must be an integer. +new_actor_length = 0 + +# require_dyad: Events require a non-null source and target: setting this false is likely +# to result in a very large number of nonsense events. As happened with the +# infamous GDELT data set of 2013-2014. And certainly no one wants to see +# that again. So the default is True +require_dyad = False + +# stop_on_error: If True, parsing errors cause the program to halt; typically used for +# debugging. With the default [false], the error is written to the error +# file, record is skipped, and processing continues. +stop_on_error = False + +# commas: These adjust the length (in words) of comma-delimited clauses that are eliminated +# from the parse. To deactivate, set the max to zero. 
+# Defaults, based on TABARI, are in () +# comma_min : internal clause minimum length [2] +# comma_max : internal clause maximum length [8] +# comma_bmin : initial ("begin") clause minimum length [0] +# comma_bmax : initial clause maximum length [0 : deactivated by default] +# comma_emin : terminal ("end") clause minimum length [2] +# comma_emax : terminal clause maximum length [8] +comma_min = 2 +comma_max = 8 +comma_bmin = 0 +comma_bmax = 0 +comma_emin = 2 +comma_emax = 8 + +[StanfordNLP] +stanford_dir = ~/stanford-corenlp/ + diff --git a/pipeline.py b/pipeline.py index 9e11cdf..60b65a9 100644 --- a/pipeline.py +++ b/pipeline.py @@ -14,7 +14,7 @@ import scraper_connection -def main(file_details, geo_details, server_details, petrarch_version, logger_file=None, run_filter=None, +def main(file_details, geo_details, server_details, petrarch_version, mongo_details, logger_file=None, run_filter=None, run_date='', version=''): """ Main function to run all the things. @@ -32,6 +32,9 @@ def main(file_details, geo_details, server_details, petrarch_version, logger_fil Config information specifically related to the remote server for FTP uploading. + petrarch_version: String. + Which version of Petrarch to use. Must be '1' or '2' + logger_file: String. Path to a log file. 
Defaults to ``None`` and opens a ``PHOX_pipeline.log`` file in the current working @@ -111,12 +114,12 @@ def main(file_details, geo_details, server_details, petrarch_version, logger_fil # petrarch.run_pipeline(formatted, # '{}{}.txt'.format(file_details.fullfile_stem, # date_string), parsed=True) - petr_results = petrarch.run_pipeline(formatted, write_output=False, + petr_results = petrarch.run_pipeline(formatted, config = "petr_config.ini", write_output=False, parsed=True) elif run_filter == 'True': print('Running PETRARCH and returning output.') logger.info('Running PETRARCH and returning output.') - petr_results = petrarch.run_pipeline(formatted, write_output=False, + petr_results = petrarch.run_pipeline(formatted, config = "petr_config.ini", write_output=False, parsed=True) else: print("""Can't run with the options you've specified. You need to fix diff --git a/scraper_connection.py b/scraper_connection.py index f599e04..a95e14f 100644 --- a/scraper_connection.py +++ b/scraper_connection.py @@ -74,6 +74,7 @@ def query_all(collection, lt_date, gt_date, sources, write_file=False): posts = collection.find({"$and": [{"date_added": {"$lte": lt_date}}, {"date_added": {"$gt": gt_date}}, {"source": {"$in": sources}}]}) + #posts = collection.find() print('Total number of stories: {}'.format(posts.count())) logger.info('Total number of stories: {}'.format(posts.count())) @@ -145,7 +146,8 @@ def main(current_date, file_details, write_file=False, file_stem=None): """ sources = _get_sources('source_keys.txt') - conn = utilities.make_conn(file_details.auth_db, file_details.auth_user, + conn = utilities.make_conn(file_details.db_db, file_details.db_collection, + file_details.auth_db, file_details.auth_user, file_details.auth_pass, file_details.db_host) less_than = datetime.datetime(current_date.year, current_date.month, diff --git a/utilities.py b/utilities.py index affb21e..a0b1fc1 100644 --- a/utilities.py +++ b/utilities.py @@ -101,6 +101,15 @@ def 
parse_config(config_filename): petrarch_version = parser.get('Petrarch', 'petrarch_version') + if 'Mongo' in parser.sections(): + db_db = parser.get('Mongo', 'db') + db_collection = parser.get('Mongo', 'collection') + else: + db_db = 'event_scrape' + db_collection = 'stories' + + + file_attrs = namedtuple('FileAttributes', ['scraper_stem', 'recordfile_stem', 'fullfile_stem', @@ -112,12 +121,16 @@ def parse_config(config_filename): 'auth_db', 'auth_user', 'auth_pass', - 'db_host']) + 'db_host', + 'db_db', + 'db_collection']) file_list = file_attrs(scraper_stem, recordfile_stem, fullfile_stem, eventfile_stem, dupfile_stem, outputfile_stem, oneaday_filter, log_file, auth_db, auth_user, - auth_pass, db_host) + auth_pass, db_host, db_db, db_collection) + + return server_list, geo_list, file_list, petrarch_version except Exception as e: @@ -161,7 +174,7 @@ def do_RuntimeError(st1, filename='', st2=''): raise RuntimeError(st1 + ' ' + filename + ' ' + st2) -def make_conn(db_auth, db_user, db_pass, db_host=None): +def make_conn(db_db, db_collection, db_auth, db_user, db_pass, db_host=None): """ Function to establish a connection to a local MonoDB instance. @@ -192,8 +205,8 @@ def make_conn(db_auth, db_user, db_pass, db_host=None): client = MongoClient() if db_auth: client[db_auth].authenticate(db_user, db_pass) - database = client.event_scrape - collection = database['stories'] + database = client[db_db] + collection = database[db_collection] return collection