diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..451f50e --- /dev/null +++ b/.travis.yml @@ -0,0 +1,7 @@ +language: python +python: + - "2.7" +install: + - pip install -r requirements.txt +script: py.test +sudo: false diff --git a/PHOX_config.ini b/PHOX_config.ini index 1d7fd0c..3c38e25 100644 --- a/PHOX_config.ini +++ b/PHOX_config.ini @@ -3,8 +3,13 @@ server_name = username = password = server_dir = public_html/datasets/phoenix/ -cliff_host = localhost -cliff_port = 8999 + +[Geolocation] +geo_service = Mordecai +cliff_host = http://localhost +cliff_port = 8080 +mordecai_host = http://localhost +mordecai_port = 5000 [Pipeline] scraper_stem = scraper_results_ @@ -15,7 +20,14 @@ dupfile_stem = Phoenix.dupindex. outputfile_stem = Phoenix.events.20 newsourcestem = newsources. -oneaday_filter = True +oneaday_filter = False + +[Petrarch] +petrarch_version = 2 + +[Mongo] +db = event_scrape +collection = stories #[Logging] #log_file = /root/logs/pipeline.log @@ -24,3 +36,4 @@ oneaday_filter = True #auth_db = db_name #auth_user = username #auth_pass = password +#db_host = 127.0.0.1 diff --git a/README.md b/README.md index 2659250..3d52aeb 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,64 @@ phoenix_pipeline ================ +[![Build Status](https://travis-ci.org/openeventdata/phoenix_pipeline.svg?branch=master)](https://travis-ci.org/openeventdata/phoenix_pipeline) +[![Join the chat at https://gitter.im/openeventdata/phoenix_pipeline](https://badges.gitter.im/openeventdata/phoenix_pipeline.svg)](https://gitter.im/openeventdata/phoenix_pipeline?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) + Turning news into events since 2014. + This system links a series of Python programs to convert the files which have been -downloaded by a [web scraper](https://github.com/openeventdata/scraper) to coded event data which is uploaded to a web site -designated in the config file. 
The system processes a single day of information, but this -can be derived from multiple text files. The pipeline also implements a filter for -source URLs as defined by the keys in the `source_keys.txt` file. These keys -correspond to the `source` field in the MongoDB instance. +downloaded by a [web scraper](https://github.com/openeventdata/scraper) to +coded event data which is uploaded to a web site designated in the config file. +The system processes a single day of information, but this can be derived from +multiple text files. The pipeline also implements a filter for source URLs as +defined by the keys in the `source_keys.txt` file. These keys correspond to the +`source` field in the MongoDB instance. For more information please visit the [documentation](http://phoenix-pipeline.readthedocs.org/en/latest/). -##Running + +## Requirements + +The pipeline requires either +[Petrarch](https://github.com/openeventdata/petrarch) or +[Petrarch2](https://github.com/openeventdata/petrarch2) to be installed. Both +are Python programs and can be installed from GitHub using pip. + +The pipeline assumes that stories are stored in a MongoDB in a particular +format. This format is the one used by the OEDA news RSS scraper. See [the +code](https://github.com/openeventdata/scraper/blob/master/mongo_connection.py) +for details on how it structures stories in the Mongo. Using this pipeline with +differently formatted databases will require changing field names throughout +the code. The pipeline also requires that stories have been parsed with +Stanford CoreNLP. See the [simple and +stable](https://github.com/openeventdata/stanford_pipeline) way to do this, or +the [experimental distributed](https://github.com/oudalab/biryani) approach. + +The pipeline requires one of two geocoding systems to be running: CLIFF-CLAVIN +or Mordecai. 
For CLIFF, see a VM version +[here](https://github.com/ahalterman/CLIFF-up) or a Docker container version +[here](https://github.com/openeventdata/cliff_container). For Mordecai, see the +setup instructions [here](https://github.com/openeventdata/mordecai). The +version of the pipeline deployed in production currently uses CLIFF/CLAVIN, but +future development will focus on improvements to Mordecai. + +## Configuration + +The pipeline has two configuration files. `PHOX_config.ini` specifies which +geolocation system to use, how to name the files produced by the pipeline, and +how to upload the files to a remote server if desired. + +`petr_config.ini` is the configuration file for Petrarch2 itself, including the +location of dictionaries, new actor extraction options, and the one-a-day filter. For +more details see the main [Petrarch2 repo](https://github.com/openeventdata/petrarch2/). + +## Running To run the program: - python pipeline.py +``` +python pipeline.py +``` + + diff --git a/geolocation.py b/geolocation.py index 4a28312..1e5e94e 100644 --- a/geolocation.py +++ b/geolocation.py @@ -4,6 +4,7 @@ import requests import utilities from bson.objectid import ObjectId +import json def query_cliff(sentence, host, port): """ @@ -33,7 +34,7 @@ def query_cliff(sentence, host, port): place_info = {'lat': '', 'lon': '', 'placeName': '', 'countryCode': '', 'stateName': '', 'restype' : ''} - cliff_address = "http://{}:{}/CLIFF-2.0.0/parse/text".format(host, port) + cliff_address = "{}:{}/CLIFF-2.0.0/parse/text".format(host, port) try: located = requests.get(cliff_address, params=payload).json() @@ -230,12 +231,93 @@ def iso_convert(iso2c): iso3c = "NA" return iso3c +def query_mordecai(sentence, host, port): + """ + Takes a sentence from a news article, passes it to the Mordecai geolocation + service, and extracts the relevant data that Mordecai returns. + Parameters + ---------- + sentence: String. + Text from which an event was coded. 
+ host: String + Host where Mordecai is running (taken from config) + port: String + Port that Mordecai service is listening on + Returns + ------- + lat: String. + Latitude of a location. + lon: String. + Longitude of a location. + placeName: String. + The name of the most precise location extracted from the sentence. + stateName: String. + The name of the state/region/province extracted from the sentence. + countryCode: String. + The ISO 3 character country code of the country extracted from the sentence. + """ + headers = {'Content-Type': 'application/json'} + data = {'text': sentence} + data = json.dumps(data) + dest = "{0}:{1}/places".format(host, port) + out = requests.post(dest, data=data, headers=headers) + return json.loads(out.text) + +def test_mordecai(sentence, host, port): + """ + Check if Mordecai service is up and responding on given host and port. + Parameters + ---------- + sentence: String. + Text from which an event was coded. + """ + +def mordecai(events, file_details, server_details, geo_details): + """ + Pulls out a database ID and queries the Mordecai geolocation system + running locally and find location information within the sentence. + Parameters + ---------- + events: Dictionary. + Contains filtered events from the one-a-day filter. Keys are + (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of + IDs, sources, and issues. + Returns + ------- + events: Dictionary. + Same as in the parameter but with the addition of a value that is + a list of lon, lat, placeName, stateName, countryCode. 
+ """ + coll = utilities.make_conn(file_details.db_db, file_details.db_collection, + file_details.auth_db, file_details.auth_user, + file_details.auth_pass) + + for event in events: + event_id, sentence_id = events[event]['ids'][0].split('_') + # print(event_id) + result = coll.find_one({'_id': ObjectId(event_id.split('_')[0])}) + sents = utilities.sentence_segmenter(result['content']) + + query_text = sents[int(sentence_id)] + geo_info = query_mordecai(query_text, geo_details.mordecai_host, + geo_details.mordecai_port) + try: + # temporary hack: take the first location: + geo_info = geo_info[0] + # NA is for ADM1, which mord doesn't return. See issue #2 + events[event]['geo'] = (geo_info['lon'], geo_info['lat'], + geo_info['placename'], "NA", geo_info['countrycode']) + except Exception as e: + events[event]['geo'] = ("NA", "NA", "NA", "NA", "NA") + + return events -def main(events, file_details, server_details): +def cliff(events, file_details, server_details, geo_details): """ Pulls out a database ID and runs the ``query_cliff`` function to hit MIT's - CLIFF/CLAVIN geolocation system running locally and find location - information within the sentence. + CLIFF/CLAVIN geolocation system running locally and find location + information within the sentence. Note, this function calls back to the database + where stories are stored. Parameters ---------- events: Dictionary. @@ -248,7 +330,8 @@ def main(events, file_details, server_details): Same as in the parameter but with the addition of a value that is a list of lon, lat, placeName, stateName, countryCode. 
""" - coll = utilities.make_conn(file_details.auth_db, file_details.auth_user, + coll = utilities.make_conn(file_details.db_db, file_details.db_collection, + file_details.auth_db, file_details.auth_user, file_details.auth_pass) for event in events: @@ -258,8 +341,8 @@ def main(events, file_details, server_details): sents = utilities.sentence_segmenter(result['content']) query_text = sents[int(sentence_id)] - geo_info = query_cliff(query_text, server_details.cliff_host, - server_details.cliff_port) + geo_info = query_cliff(query_text, geo_details.cliff_host, + geo_details.cliff_port) if geo_info: try: if geo_info['countryCode'] != "": diff --git a/petr_config.ini b/petr_config.ini new file mode 100644 index 0000000..9ad6cd5 --- /dev/null +++ b/petr_config.ini @@ -0,0 +1,106 @@ +# Configuration file for release version of PETRARCH event coder +# Codes the GigaWord.sample.PETR.txt using current dictionaries and default options +# Last update: 30 April 2015 + +[Dictionaries] +# See the PETRreader.py file for the purpose and format of these files +verbfile_name = CAMEO.2.0.txt +actorfile_list = Phoenix.Countries.actors.txt, Phoenix.International.actors.txt, Phoenix.MilNonState.actors.txt +agentfile_name = Phoenix.agents.txt +discardfile_name = Phoenix.discards.txt +issuefile_name = Phoenix.IssueCoding.txt + + + + +[Options] +# textfile_list is a comma-delimited list of text files to code. This list has priority if +# both a textfile_list and textfile_name are present +textfile_list = data/text/GigaWord.sample.PETR.xml +#textfile_list = AFP0808-01.xml, AFP0909-01.xml, AFP1210-01.xml +# textfile_name is the name of a file containing a list of names of files to code, one +# file name per line. 
+#textfile_name = PETR.textfiles.benchmark.txt + +# eventfile_name is the output file for the events +eventfile_name = events.PETR-Demo.txt + + +# INTERFACE OPTIONS: uncomment to activate +# Default: set all of these false, which is equivalent to an A)utocode in TABARI + +# code_by_sentence: show events after each sentence has been coded; default is to +# show events after all of the sentences in a story have been coded +code_by_sentence = True +# pause_by_sentence: pause after the coding of each sentence. Entering 'Return' will +# cause the next sentence to be coded; entering any character will +# cause the program to exit. Default is to code without any pausing. +#pause_by_sentence = True +# pause_by_story: pause after the coding of each story. +#pause_by_story = True + + +# CODING OPTIONS: +# Defaults are more or less equivalent to TABARI + +# write_actor_root: If True, the event record will include the text of the actor root: +# The root is the text at the head of the actor synonym set in the +# dictionary. Default is False +write_actor_root = False + +# write_actor_text: If True, the event record will include the complete text of +# the noun phrase that was used to identify the actor. Default is False +write_actor_text = True + +# write_event_text: If True, the event record will include the complete text of +# the verb phrase that was used to identify the event. Default is False +write_event_text = True + +# NULL CODING OPTIONS +# null_verbs: If True, only get verb phrases that are not in the dictionary but are associated +# with coded noun phrases +null_verbs = False + +# null_actors: If True, only get actor phrases that are not in the dictionary but associated with +# coded verb phrases. This also requires new_actor_length to be set to a value > 0: +# typically a value of 4 to 8 will give good results. 
+null_actors = False + +# new_actor_length: Maximum length for new actors extracted from noun phrases if no +# actor or agent generating a code is found. To disable and just +# use null codes "---", set to zero; this is the default. +# Setting this to a large number will extract anything found in a (NP +# noun phrase, though usually true actors contain a small number of words +# This must be an integer. +new_actor_length = 0 + +# require_dyad: Events require a non-null source and target: setting this false is likely +# to result in a very large number of nonsense events. As happened with the +# infamous GDELT data set of 2013-2014. And certainly no one wants to see +# that again. So the default is True +require_dyad = False + +# stop_on_error: If True, parsing errors cause the program to halt; typically used for +# debugging. With the default [false], the error is written to the error +# file, record is skipped, and processing continues. +stop_on_error = False + +# commas: These adjust the length (in words) of comma-delimited clauses that are eliminated +# from the parse. To deactivate, set the max to zero. 
+# Defaults, based on TABARI, are in () +# comma_min : internal clause minimum length [2] +# comma_max : internal clause maximum length [8] +# comma_bmin : initial ("begin") clause minimum length [0] +# comma_bmax : initial clause maximum length [0 : deactivated by default] +# comma_emin : terminal ("end") clause minimum length [2] +# comma_emax : terminal clause maximum length [8] +comma_min = 2 +comma_max = 8 +comma_bmin = 0 +comma_bmax = 0 +comma_emin = 2 +comma_emax = 8 + +[StanfordNLP] +stanford_dir = ~/stanford-corenlp/ + diff --git a/pipeline.py b/pipeline.py index d2cd443..60b65a9 100644 --- a/pipeline.py +++ b/pipeline.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import sys import logging +import requests import datetime import dateutil import uploader @@ -11,10 +12,9 @@ import oneaday_filter import result_formatter import scraper_connection -from petrarch import petrarch -def main(file_details, server_details, logger_file=None, run_filter=None, +def main(file_details, geo_details, server_details, petrarch_version, mongo_details, logger_file=None, run_filter=None, run_date='', version=''): """ Main function to run all the things. @@ -25,10 +25,16 @@ def main(file_details, server_details, logger_file=None, run_filter=None, file_details: Named tuple. All the other config information not in ``server_details``. + geo_details: Named tuple. + Settings for geocoding. + server_details: Named tuple. Config information specifically related to the remote server for FTP uploading. + petrarch_version: String. + Which version of Petrarch to use. Must be '1' or '2' + logger_file: String. Path to a log file. 
Defaults to ``None`` and opens a ``PHOX_pipeline.log`` file in the current working @@ -50,6 +56,16 @@ def main(file_details, server_details, logger_file=None, run_filter=None, # get a local copy for the pipeline logger = logging.getLogger('pipeline_log') + if petrarch_version == '1': + from petrarch import petrarch + logger.info("Using original Petrarch version") + elif petrarch_version == '2': + from petrarch2 import petrarch2 as petrarch + logger.info("Using Petrarch2") + else: + logger.error("Invalid Petrarch version. Argument must be '1' or '2'") + + print('\nPHOX.pipeline run:', datetime.datetime.utcnow()) if run_date: @@ -66,9 +82,20 @@ def main(file_details, server_details, logger_file=None, run_filter=None, process_date.day) logger.info('Date string: {}'.format(date_string)) print('Date string:', date_string) - results, scraperfilename = scraper_connection.main(process_date, file_details) + if geo_details.geo_service == "Mordecai": + dest = "{0}:{1}/places".format(geo_details.mordecai_host, geo_details.mordecai_port) + try: + out = requests.get(dest) + assert out.status_code == 200 + except (AssertionError, requests.exceptions.ConnectionError): + print("Mordecai geolocation service not responding. Continuing anyway...") + elif geo_details.geo_service == "CLIFF": + print("CLIFF") + else: + print("Invalid geo service name. Must be 'CLIFF' or 'Mordecai'. Continuing...") + if scraperfilename: logger.info("Scraper file name: " + scraperfilename) @@ -78,22 +105,21 @@ def main(file_details, server_details, logger_file=None, run_filter=None, print("Running Mongo.formatter.py") formatted = formatter.main(results, file_details, process_date, date_string) - logger.info("Running PETRARCH") file_details.fullfile_stem + date_string if run_filter == 'False': print('Running PETRARCH and writing to a file. No one-a-day.') logger.info('Running PETRARCH and writing to a file. 
No one-a-day.') - #Command to write output to a file directly from PETR -# petrarch.run_pipeline(formatted, -# '{}{}.txt'.format(file_details.fullfile_stem, -# date_string), parsed=True) - petr_results = petrarch.run_pipeline(formatted, write_output=False, + # Command to write output to a file directly from PETR + # petrarch.run_pipeline(formatted, + # '{}{}.txt'.format(file_details.fullfile_stem, + # date_string), parsed=True) + petr_results = petrarch.run_pipeline(formatted, config = "petr_config.ini", write_output=False, parsed=True) elif run_filter == 'True': print('Running PETRARCH and returning output.') logger.info('Running PETRARCH and returning output.') - petr_results = petrarch.run_pipeline(formatted, write_output=False, + petr_results = petrarch.run_pipeline(formatted, config = "petr_config.ini", write_output=False, parsed=True) else: print("""Can't run with the options you've specified. You need to fix @@ -114,7 +140,7 @@ def main(file_details, server_details, logger_file=None, run_filter=None, print("Running postprocess.py") if version: postprocess.main(formatted_results, date_string, version, file_details, - server_details) + server_details, geo_details) else: print("Please specify a data version number. 
Program ending.") @@ -131,9 +157,10 @@ def main(file_details, server_details, logger_file=None, run_filter=None, print('PHOX.pipeline end:', datetime.datetime.utcnow()) -if __name__ == '__main__': - # initialize the various utilities globals - server_details, file_details = utilities.parse_config('PHOX_config.ini') - - main(file_details, server_details, file_details.log_file, +def run(): + server_details, geo_details, file_details, petrarch_version = utilities.parse_config('PHOX_config.ini') + main(file_details, geo_details, server_details, petrarch_version, file_details.log_file, run_filter=file_details.oneaday_filter, version='v0.0.0') + +if __name__ == '__main__': + run() diff --git a/pipeline_sched.py b/pipeline_sched.py new file mode 100644 index 0000000..35c4850 --- /dev/null +++ b/pipeline_sched.py @@ -0,0 +1,9 @@ +from pipeline import run + +from apscheduler.schedulers.blocking import BlockingScheduler +from apscheduler.triggers.cron import CronTrigger + +if __name__ == '__main__': + scheduler = BlockingScheduler() + scheduler.add_job(run, CronTrigger(minute=0, hour=1)) + scheduler.start() diff --git a/postprocess.py b/postprocess.py index 09d8943..0ad7d29 100644 --- a/postprocess.py +++ b/postprocess.py @@ -272,33 +272,34 @@ def process_actors(event): one of GOV, MIL, REB, OPP, PTY, COP, JUD, SPY, MED, EDU, BUS, CRM, or CVL. The ``others`` contains all other actor or agent codes. 
""" - countries = ('ABW', 'AFG', 'AGO', 'AIA', 'ALA', 'ALB', 'AND', 'ARE', 'ARG', - 'ARM', 'ASM', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BES', 'BFA', - 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLM', 'BLR', 'BLZ', 'BMU', - 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN', 'CHE', - 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COK', 'COL', 'COM', - 'CPV', 'CRI', 'CUB', 'CUW', 'CYM', 'CYP', 'CZE', 'DEU', 'DJI', - 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESH', 'ESP', - 'EST', 'ETH', 'FIN', 'FJI', 'FLK', 'FRA', 'FRO', 'FSM', 'GAB', - 'GBR', 'GEO', 'GGY', 'GHA', 'GIB', 'GIN', 'GLP', 'GMB', 'GNB', - 'GNQ', 'GRC', 'GRD', 'GRL', 'GTM', 'GUF', 'GUM', 'GUY', 'HKG', - 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IMN', 'IND', 'IRL', 'IRN', - 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JEY', 'JOR', 'JPN', 'KAZ', - 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', - 'LBR', 'LBY', 'LCA', 'LIE', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', - 'MAC', 'MAF', 'MAR', 'MCO', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', - 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', 'MNP', 'MOZ', 'MRT', - 'MSR', 'MTQ', 'MUS', 'MWI', 'MYS', 'MYT', 'NAM', 'NCL', 'NER', - 'NFK', 'NGA', 'NIC', 'NIU', 'NLD', 'NOR', 'NPL', 'NRU', 'NZL', - 'OMN', 'PAK', 'PAN', 'PCN', 'PER', 'PHL', 'PLW', 'PNG', 'POL', - 'PRI', 'PRK', 'PRT', 'PRY', 'PSE', 'PYF', 'QAT', 'REU', 'ROU', - 'RUS', 'RWA', 'SAU', 'SCG', 'SDN', 'SEN', 'SGP', 'SHN', 'SJM', - 'SLB', 'SLE', 'SLV', 'SMR', 'SOM', 'SPM', 'SRB', 'SSD', 'STP', - 'SUR', 'SVK', 'SVN', 'SWE', 'SWZ', 'SXM', 'SYC', 'SYR', 'TCA', - 'TCD', 'TGO', 'THA', 'TJK', 'TKL', 'TKM', 'TLS', 'TON', 'TTO', - 'TUN', 'TUR', 'TWN', 'TUV', 'TZA', 'UGA', 'UKR', 'URY', 'USA', - 'UZB', 'VAT', 'VCT', 'VEN', 'VGB', 'VIR', 'VNM', 'VUT', 'WLF', - 'WSM', 'YEM', 'ZAF', 'ZMB', 'ZWE', 'ATG', 'AUS') + countries = ('ABW', 'AFG', 'AGO', 'AIA', 'ALA', 'ALB', 'AND', 'ARE', + 'ARG', 'ARM', 'ASM', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BES', + 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLM', 'BLR', 'BLZ', + 'BMU', 'BOL', 'BRA', 'BRB', 'BRN', 
'BTN', 'BYS', 'BWA', 'CAF', + 'CAN', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COK', + 'COL', 'COM', 'CPV', 'CRI', 'CSK', 'CUB', 'CUW', 'CYM', 'CYP', + 'CZE', 'DDR', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', + 'EGY', 'ERI', 'ESH', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FLK', + 'FRA', 'FRO', 'FSM', 'GAB', 'GBR', 'GEO', 'GGY', 'GHA', 'GIB', + 'GIN', 'GLP', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD', 'GRL', 'GTM', + 'GUF', 'GUM', 'GUY', 'HKG', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', + 'IMN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', + 'JEY', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', + 'KOR', 'KSV', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LCA', 'LIE', + 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAC', 'MAF', 'MAR', 'MCO', + 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', 'MKD', 'MLI', 'MLT', 'MMR', + 'MNE', 'MNG', 'MNP', 'MOZ', 'MRT', 'MSR', 'MTQ', 'MUS', 'MWI', + 'MYS', 'MYT', 'NAM', 'NCL', 'NER', 'NFK', 'NGA', 'NIC', 'NIU', + 'NLD', 'NOR', 'NPL', 'NRU', 'NZL', 'OMN', 'PAK', 'PAN', 'PCN', + 'PER', 'PHL', 'PLW', 'PNG', 'POL', 'PRI', 'PRK', 'PRT', 'PRY', + 'PSE', 'PYF', 'QAT', 'REU', 'ROU', 'RUS', 'RWA', 'SAU', 'SCG', + 'SDN', 'SEN', 'SGP', 'SHN', 'SJM', 'SLB', 'SLE', 'SLV', 'SMR', + 'SOM', 'SPM', 'SRB', 'SSD', 'SUN', 'STP', 'SUR', 'SVK', 'SVN', + 'SWE', 'SWZ', 'SXM', 'SYC', 'SYR', 'TCA', 'TCD', 'TGO', 'THA', + 'TJK', 'TMP', 'TKL', 'TKM', 'TLS', 'TON', 'TTO', 'TUN', 'TUR', + 'TWN', 'TUV', 'TZA', 'UGA', 'UKR', 'URY', 'USA', 'UZB', 'VAT', + 'VCT', 'VEN', 'VGB', 'VIR', 'VNM', 'VUT', 'WLF', 'WSM', 'YEM', + 'YMD', 'YUG', 'ZAF', 'ZAR', 'ZMB', 'ZWE', 'ATG', 'AUS') root_actors = ('IGO', 'NGO', 'IMG', 'MNC') primary_agent = ('GOV', 'MIL', 'REB', 'OPP', 'PTY', 'COP', 'JUD', 'SPY', 'MED', 'EDU', 'BUS', 'CRM', 'CVL') @@ -357,7 +358,7 @@ def process_actors(event): return actors -def main(event_dict, this_date, version, file_details, server_details): +def main(event_dict, this_date, version, file_details, server_details, geo_details): """ Pulls in the coded results from PETRARCH 
dictionary in the {StoryID: [(record), (record)]} format and allows only one unique @@ -380,12 +381,21 @@ def main(event_dict, this_date, version, file_details, server_details): file_details: NamedTuple. Container generated from the config file specifying file stems and other relevant options. + server_details: NamedTuple. + Info for uploading to server. + geo_details: NamedTuple. + Info about geo type and geo server details. """ logger = logging.getLogger('pipeline_log') logger.info('Geolocating.') print('Geolocating') - updated_events = geolocation.main(event_dict, file_details, server_details) + if geo_details.geo_service == "CLIFF": + updated_events = geolocation.cliff(event_dict, file_details, server_details, geo_details) + elif geo_details.geo_service == "Mordecai": + updated_events = geolocation.mordecai(event_dict, file_details, server_details, geo_details) + else: + print("Invalid geo service name. Must be 'CLIFF' or 'Mordecai'.") logger.info('Formatting events for output.') event_write = create_strings(updated_events, version) diff --git a/requirements.txt b/requirements.txt index 8afc297..cf173b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,21 @@ +APScheduler==3.0.3 +corenlp-python==3.2.0.post3 +docutils==0.11 +futures==2.2.0 Jinja2==2.7.2 MarkupSafe==0.23 -PyYAML==3.11 -Pygments==1.6 -Sphinx==1.2.2 -Unidecode==0.04.16 -corenlp-python==3.2.0-3 -docutils==0.11 numpydoc==0.4 -petrarch==0.01a +-e git+https://github.com/openeventdata/petrarch.git@5fe92b676e8b4fb9a1964e7da55e9c95e9a5745f#egg=petrarch-master +-e git+https://github.com/openeventdata/petrarch2.git#egg=petrarch2-master pexpect==3.2 +Pygments==1.6 pymongo==2.7 python-dateutil==2.2 +pytz==2015.2 +PyYAML==3.11 requests==2.3.0 six==1.6.1 -wsgiref==0.1.2 +Sphinx==1.2.2 +tzlocal==1.1.3 +Unidecode==0.4.16 xmltodict==0.9.0 diff --git a/scraper_connection.py b/scraper_connection.py index 331fa0d..a95e14f 100644 --- a/scraper_connection.py +++ b/scraper_connection.py @@ -74,6 +74,7 @@ 
def query_all(collection, lt_date, gt_date, sources, write_file=False): posts = collection.find({"$and": [{"date_added": {"$lte": lt_date}}, {"date_added": {"$gt": gt_date}}, {"source": {"$in": sources}}]}) + #posts = collection.find() print('Total number of stories: {}'.format(posts.count())) logger.info('Total number of stories: {}'.format(posts.count())) @@ -145,8 +146,9 @@ def main(current_date, file_details, write_file=False, file_stem=None): """ sources = _get_sources('source_keys.txt') - conn = utilities.make_conn(file_details.auth_db, file_details.auth_user, - file_details.auth_pass) + conn = utilities.make_conn(file_details.db_db, file_details.db_collection, + file_details.auth_db, file_details.auth_user, + file_details.auth_pass, file_details.db_host) less_than = datetime.datetime(current_date.year, current_date.month, current_date.day) diff --git a/tests/test_geolocation.py b/tests/test_geolocation.py new file mode 100644 index 0000000..54877d8 --- /dev/null +++ b/tests/test_geolocation.py @@ -0,0 +1,13 @@ +from bson.objectid import ObjectId +import datetime +import sys +import os +sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/../") +import geolocation +import utilities + +def test_geo_config(): + server_details, geo_details, file_details, petrarch_version = utilities.parse_config('PHOX_config.ini') + geo_keys = geo_details._asdict().keys() + assert geo_keys == ['geo_service', 'cliff_host', 'cliff_port', 'mordecai_host', 'mordecai_port'] + diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py new file mode 100644 index 0000000..659ae4e --- /dev/null +++ b/tests/test_pipeline.py @@ -0,0 +1,91 @@ +from bson.objectid import ObjectId +import datetime +from petrarch import petrarch +from petrarch2 import petrarch2 + +formatted = [{u'language': u'english', +u'title': u'6 killed in attacks in Iraqi capital Friday', +u'url': u'http://www.menafn.com/1094827896/6-killed-in-attacks-in-Iraqi-capital-Friday?src=RSS', +u'stanford': 1, 
+u'content': u'BAGHDAD: At least six people, including a soldier, were killed in a spate of attacks across Iraqi capital Baghdad on Friday. A sniper opened fire on soldiers manning a checkpoint in southern Baghdad, killing a soldier and injuring three others, police officer Nader al-Janabi told Anadolu Agency. Two civilians were killed and six others injured in a bomb blast in al-Zafarana district in south-eastern Baghdad, he said. Three more civilians were killed and seven others injured in two bomb blasts in southern and northern Baghdad, according to al-Janabi. Iraqi officials often blame the attacks on the Daesh terrorist group, which overran vast swathes of territory in Iraq in 2014. ', +u'source': u'menafn_iraq', +u'parsed_sents': [u'(ROOT (S (NP (NNP BAGHDAD)) (: :) (NP (NP (QP (IN At) (JJS least) (CD six)) (NNS people)) (, ,) (PP (VBG including) (NP (DT a) (NN soldier))) (, ,)) (VP (VBD were) (VP (VBN killed) (PP (IN in) (NP (NP (DT a) (NN spate)) (PP (IN of) (NP (NNS attacks))))) (PP (IN across) (NP (JJ Iraqi) (NN capital) (NNP Baghdad))) (PP (IN on) (NP (NNP Friday))))) (. .)))', +u'(ROOT (S (NP (DT A) (NN sniper)) (VP (VBD opened) (NP (NN fire)) (PP (IN on) (S (S (NP (NNS soldiers)) (VP (VBG manning) (NP (NP (DT a) (NN checkpoint)) (PP (IN in) (NP (JJ southern) (NNP Baghdad)))) (, ,) (S (VP (VP (VBG killing) (NP (DT a) (NN soldier))) (CC and) (VP (VBG injuring) (NP (CD three) (NNS others))))))) (, ,) (NP (NNS police) (NN officer) (NNP Nader) (NNP al-Janabi)) (VP (VBD told) (NP (NNP Anadolu) (NNP Agency))) (. .))))))', +u'(ROOT (S (S (NP (CD Two) (NNS civilians)) (VP (VBD were) (VP (VP (VBN killed)) (CC and) (NP (NP (CD six) (NNS others)) (VP (VBN injured) (PP (IN in) (NP (NP (DT a) (NN bomb) (NN blast)) (PP (IN in) (NP (NP (NN al-Zafarana) (NN district)) (PP (IN in) (NP (JJ south-eastern) (NNP Baghdad)))))))))))) (, ,) (NP (PRP he)) (VP (VBD said)) (. 
.)))', +u'(ROOT (S (NP (CD Three) (JJR more) (NNS civilians)) (VP (VBD were) (VP (VP (VBN killed)) (CC and) (NP (NP (CD seven) (NNS others)) (VP (VBN injured) (PP (IN in) (NP (NP (CD two) (NN bomb) (NNS blasts)) (PP (IN in) (NP (ADJP (JJ southern) (CC and) (JJ northern)) (NNP Baghdad))))))) (, ,) (PP (VBG according) (PP (TO to) (NP (NNP al-Janabi)))))) (. .)))', +u'(ROOT (S (NP (JJ Iraqi) (NNS officials)) (ADVP (RB often)) (VP (VBP blame) (NP (NP (DT the) (NNS attacks)) (PP (IN on) (NP (NP (DT the) (NNP Daesh) (JJ terrorist) (NN group)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (VBD overran) (NP (NP (JJ vast) (NNS swathes)) (PP (IN of) (NP (NP (NN territory)) (PP (IN in) (NP (NNP Iraq)))))) (PP (IN in) (NP (CD 2014)))))))))) (. .)))'], +u'date': u'160626', +u'date_added': datetime.datetime(2016, 6, 26, 19, 0, 17, 640000), +u'_id': ObjectId('57702641172ab87eb7dc98fa')}, +{u'language': u'english', +u'title': u'Soldiers, Policemen Fight Over Rice', +u'url': u'http://www.thetidenewsonline.com/2016/06/24/soldiers-policemen-fight-over-rice/', +u'stanford': 1, +u'content': u'There was chaos at the Borno State Government House in Maiduguri, yesterday, as soldiers and policemen engaged in gun battle over rice meant for internally displaced persons. The Government House is besieged daily by thousands of internally displaced persons within Maiduguri metropolis, who choose to stay outside of the designated camps. The IDPs, who queue for hours to receive rice and other relief items, often cause gridlock around the Government House with many of them having to go back empty handed each day. The situation, however, turned violent, yesterday afternoon when the soldiers that were deployed to maintain law and order tried to benefit from the largese. The soldiers were said to have tried to force their way into the Deputy Governor\u2019s office, the place designated for the distribution, to get their vehicles filled. 
An attempt by the mobile policemen attached to the office to prevent the soldiers from achieving their goal led to a shootout. It was gathered that the soldiers fired several warning shots and the mobile policemen shot back in return, while also firing canisters of tear gas. Lucky Irabor, to get the furious soldiers to withdraw from the battle, which caused panic across Maiduguri. It was gathered that Irabor, the most senior military officer around, and the Commissioner of Police, Aminchi Baraya, subsequently visited the injured policeman at the hospital.', +u'source': u'nigeria_tidenews', +u'parsed_sents': [u'(ROOT (S (NP (EX There)) (VP (VBD was) (NP (NP (NN chaos)) (PP (IN at) (NP (NP (DT the) (NNP Borno) (NNP State) (NNP Government) (NNP House)) (PP (IN in) (NP (NNP Maiduguri))))) (, ,) (NP (NN yesterday)) (, ,)) (PP (IN as) (NP (NP (NNS soldiers) (CC and) (NNS policemen)) (VP (VBN engaged) (PP (IN in) (NP (NP (NN gun) (NN battle)) (PP (IN over) (NP (NP (NN rice)) (VP (VBN meant) (PP (IN for) (NP (ADJP (RB internally) (JJ displaced)) (NNS persons)))))))))))) (. .)))', +u'(ROOT (S (NP (DT The) (NNP Government) (NNP House)) (VP (VBZ is) (VP (VBN besieged) (ADVP (RB daily)) (PP (IN by) (NP (NP (NNS thousands)) (PP (IN of) (NP (NP (ADJP (RB internally) (JJ displaced)) (NNS persons)) (PP (IN within) (NP (NP (NNP Maiduguri) (NN metropolis)) (, ,) (SBAR (WHNP (WP who)) (S (VP (VBP choose) (S (VP (TO to) (VP (VB stay) (ADVP (IN outside) (PP (IN of) (NP (DT the) (VBN designated) (NNS camps)))))))))))))))))) (. 
.)))', +u'(ROOT (NP (NP (NP (DT The) (NNS IDPs)) (, ,) (SBAR (WHNP (WP who)) (S (VP (VB queue) (SBAR (IN for) (S (NP (NNS hours)) (VP (TO to) (VP (VB receive) (NP (NP (NN rice) (CC and) (JJ other) (NN relief) (NNS items)) (, ,) (S (ADVP (RB often)) (VP (VBP cause) (NP (NN gridlock)) (PP (IN around) (S (NP (NP (DT the) (NNP Government) (NNP House)) (PP (IN with) (NP (NP (JJ many)) (PP (IN of) (NP (PRP them)))))) (VP (VBG having) (S (VP (TO to) (VP (VB go) (NP (ADJP (RB back) (JJ empty)) (NN handed)) (NP (DT each) (NN day))))))))))))))))))) (. .)))', +u'(ROOT (S (S (NP (DT The) (NN situation)) (, ,) (ADVP (RB however)) (, ,) (VP (VBD turned) (ADJP (JJ violent)))) (, ,) (NP (NP (NN yesterday) (NN afternoon)) (SBAR (WHADVP (WRB when)) (S (NP (NP (DT the) (NNS soldiers)) (SBAR (WHNP (WDT that)) (S (VP (VBD were) (VP (VBN deployed) (S (VP (TO to) (VP (VP (VB maintain) (NP (NN law))) (CC and) (VP (NN order) (VP (VBD tried) (S (VP (TO to) (VP (VB benefit) (PP (IN from) (NP (DT the) (NN largese))))))))))))))))))) (. .)))', +u"(ROOT (S (NP (DT The) (NNS soldiers)) (VP (VBD were) (VP (VBN said) (S (VP (TO to) (VP (VB have) (VP (VBN tried) (S (VP (TO to) (VP (VB force) (NP (PRP$ their) (NN way)) (PP (IN into) (NP (NP (NP (DT the) (NNP Deputy) (NNP Governor) (POS 's)) (NN office)) (, ,) (NP (NP (DT the) (NN place)) (VP (VBN designated) (PP (IN for) (NP (DT the) (NN distribution))))) (, ,)))))) (S (VP (TO to) (VP (VB get) (S (NP (PRP$ their) (NNS vehicles)) (VP (VBN filled)))))))))))) (. .)))", +u'(ROOT (S (NP (NP (DT An) (NN attempt)) (PP (IN by) (NP (NP (DT the) (JJ mobile) (NNS policemen)) (VP (VBN attached) (PP (TO to) (NP (DT the) (NN office))) (S (VP (TO to) (VP (VB prevent) (NP (DT the) (NNS soldiers)) (PP (IN from) (S (VP (VBG achieving) (NP (PRP$ their) (NN goal)))))))))))) (VP (VBD led) (PP (TO to) (NP (DT a) (NN shootout)))) (. 
.)))', +u'(ROOT (S (NP (PRP It)) (VP (VBD was) (VP (VBN gathered) (SBAR (IN that) (S (NP (DT the) (NNS soldiers)) (VP (VBD fired) (SBAR (S (NP (NP (JJ several) (VBG warning) (NNS shots)) (CC and) (NP (DT the) (JJ mobile) (NNS policemen))) (VP (VBD shot) (ADVP (RB back)) (PP (IN in) (NP (NN return))) (, ,) (SBAR (IN while) (S (ADVP (RB also)) (VP (NN firing) (NP (NP (NNS canisters)) (PP (IN of) (S (VP (VB tear) (NP (NN gas))))))))))))))))) (. .)))'], +u'date': u'160624', +u'date_added': datetime.datetime(2016, 6, 26, 19, 0, 18), +u'_id': ObjectId('57702642172ab87eb5dc98e9')}, +{ "_id" : ObjectId("57702678172ab87ec2dc9933"), +"content" : "BAGHDAD - A senior Iraqi commander said the city of Fallujah was \"fully liberated\" from Islamic State of Iraq and Syria (ISIS) militants on Sunday, after a more than monthlong military operation. Iraqi troops have entered the northwestern al-Julan neighborhood, the last area of Fallujah to remain under ISIS control, the head of the counterterrorism forces in the operation, Lt. Gen. Abdul-Wahab al-Saadi, told The Associated Press. Al-Saadi said the operation, which began in late May, \"is done and the city is fully liberated.\" The Iraqi army was backed by U.S.-led airstrikes and paramilitary troops, mostly Shiite militias. \"From the center of al-Julan neighborhood, we congratulate the Iraqi people and the commander in chief...and declare that the Fallujah fight is over,\" he told Iraqi state TV, flanked by military officers and soldiers. Some of the soldiers were shooting in the air, chanting and waving the Iraqi flag. He added that troops will start working on removing bombs from the city's streets and buildings. In a statement, the U.S. 
central military command overseeing the U.S.-led coalition in Iraq said: \"The Coalition continues to provide support through strikes, intelligence, and advice and assistance to the Iraqi Security Forces operating in Fallujah and will continue to do so through deliberate clearing operations.\" Prime Minister Haider al-Abadi declared victory in Fallujah over a week ago, after Iraqi forces advanced into the city center and took control of a government complex. He pledged that remaining pockets of ISIS fighters would be cleared out within hours, but fierce clashes on the city's northern and western edges persisted for days. Tens of thousands of people have fled the fighting, overwhelming camps for the displaced run by the government and aid groups. According to the U.N. refugee agency, more than 85,000 people have fled Fallujah and the surrounding area since the offensive began. The UNHCR and others have warned of dire conditions in the camps -- where temperatures are well over 40 degrees Celsius (104 F) and shelter is limited -- and have called for more funds to meet mounting needs. Fallujah, which is located about 40 miles west of Baghdad, was the first city to fall to IS, in January 2014. Fallujah was also a stronghold of Sunni insurgents following the U.S.-led invasion in 2003. More than 100 American soldiers died and hundreds more were wounded in intense, house-by-house fighting in Fallujah in 2004. ISIS extremists still control significant areas in northern and western Iraq, including the country's second-largest city, Mosul. The group declared an Islamic caliphate on the territory it holds in Iraq and Syria and at the height of its power was estimated to hold nearly a third of each country. More than 3.3 million Iraqis have fled their homes since ISIS swept across northern and western Iraq in the summer of 2014, according to U.N. figures. 
More than 40 percent of the displaced are from Anbar province, where Fallujah is located.", +"source" : "cbs_world", +"date" : "Sun, 26 Jun 2016 17:37:27 -0400", +"language" : "english", +"title" : "Iraq: Fallujah \"fully liberated\" after monthlong fight", +"url" : "http://www.cbsnews.com/news/iraqi-commander-fallujah-fully-liberated-after-a-month/", +"date_added" : datetime.datetime(2016, 6, 26, 19, 0, 18), +"stanford" : 1, +"parsed_sents" : [ "(ROOT (S (NP (NNP BAGHDAD) (: -) (NN A) (JJ senior) (JJ Iraqi) (NN commander)) (VP (VBD said) (SBAR (S (NP (NP (DT the) (NN city)) (PP (IN of) (NP (NNP Fallujah)))) (VP (VBD was) (`` ``) (VP (ADVP (RB fully)) (VBN liberated) ('' '') (PP (IN from) (NP (NP (JJ Islamic) (NN State) (PP (IN of) (NP (NP (NNP Iraq)) (CC and) (NP (NNP Syria) (PRN (-LRB- -LRB-) (NNP ISIS) (-RRB- -RRB-)) (NNS militants))))) (PP (IN on) (NP (NNP Sunday)))))) (, ,) (PP (IN after) (NP (DT a) (ADVP (JJR more) (IN than)) (JJ monthlong) (JJ military) (NN operation))))))) (. .)))", +"(ROOT (S (S (NP (JJ Iraqi) (NNS troops)) (VP (VBP have) (VP (VBN entered) (NP (DT the) (JJ northwestern) (JJ al-Julan) (NN neighborhood))))) (, ,) (NP (NP (NP (DT the) (JJ last) (NN area)) (PP (IN of) (NP (NNP Fallujah))) (S (VP (TO to) (VP (VB remain) (PP (IN under) (NP (NNP ISIS) (NN control))))))) (, ,) (NP (NP (DT the) (NN head)) (PP (IN of) (NP (NP (DT the) (NN counterterrorism) (NNS forces)) (PP (IN in) (NP (DT the) (NN operation)))))) (, ,) (NP (NNP Lt.) (NNP Gen.) (NNP Abdul-Wahab) (NNP al-Saadi)) (, ,)) (VP (VBD told) (NP (DT The) (NNP Associated) (NNP Press))) (. .)))", +"(ROOT (S (NP (NNP Al-Saadi)) (VP (VBD said) (SBAR (S (S (NP (NP (DT the) (NN operation)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (VBD began) (PP (IN in) (NP (JJ late) (NNP May)))))) (, ,)) (`` ``) (VP (VBZ is) (VP (VBN done)))) (CC and) (S (NP (DT the) (NN city)) (VP (VBZ is) (ADVP (RB fully)) (VP (VBN liberated))))))) (. .) 
('' '')))", +"(ROOT (S (`` ``) (S (PP (IN From) (NP (NP (DT the) (NN center)) (PP (IN of) (NP (JJ al-Julan) (NN neighborhood))))) (, ,) (NP (PRP we)) (VP (VP (VBP congratulate) (NP (NP (DT the) (JJ Iraqi) (NNS people)) (CC and) (NP (NP (DT the) (NN commander)) (PP (IN in) (NP (NN chief)))))) (: ...) (CC and) (VP (VB declare) (SBAR (IN that) (S (NP (DT the) (NNP Fallujah) (NN fight)) (VP (VBZ is) (ADVP (IN over)))))))) (, ,) ('' '') (NP (PRP he)) (VP (VBD told) (NP (JJ Iraqi) (NN state) (NN TV)) (, ,) (S (VP (VBN flanked) (PP (IN by) (NP (JJ military) (NNS officers) (CC and) (NNS soldiers)))))) (. .)))", +"(ROOT (S (PP (IN In) (NP (DT a) (NN statement))) (, ,) (NP (NP (DT the) (NNP U.S.) (JJ central) (JJ military) (NN command)) (VP (VBG overseeing) (NP (NP (DT the) (JJ U.S.-led) (NN coalition)) (PP (IN in) (NP (NNP Iraq)))))) (VP (VBD said) (: :) (`` ``) (S (NP (DT The) (NNP Coalition)) (VP (VP (VBZ continues) (S (VP (TO to) (VP (VB provide) (NP (NN support)) (PP (IN through) (NP (NP (NP (NNS strikes)) (, ,) (NP (NN intelligence)) (, ,) (CC and) (NP (NN advice))) (CC and) (NP (NP (NN assistance)) (PP (TO to) (NP (NP (DT the) (JJ Iraqi) (NN Security) (NNS Forces)) (VP (VBG operating) (PP (IN in) (NP (NNP Fallujah))))))))))))) (CC and) (VP (MD will) (VP (VB continue) (S (VP (TO to) (VP (VB do) (ADVP (RB so))))) (PP (IN through) (NP (JJ deliberate) (NN clearing) (NNS operations)))))))) (. .) ('' '')))", +"(ROOT (S (NP (PRP He)) (VP (VP (VBD pledged) (SBAR (IN that) (S (NP (NP (VBG remaining) (NNS pockets)) (PP (IN of) (NP (NNP ISIS) (NNS fighters)))) (VP (MD would) (VP (VB be) (VP (VBN cleared) (PRT (RP out)) (PP (IN within) (NP (NNS hours))))))))) (, ,) (CC but) (S (NP (NP (JJ fierce) (NNS clashes)) (PP (IN on) (NP (NP (DT the) (NN city) (POS 's)) (ADJP (JJ northern) (CC and) (JJ western)) (NNS edges)))) (VP (VBD persisted) (PP (IN for) (NP (NNS days)))))) (. 
.)))", +"(ROOT (S (NP (NP (NNS Tens)) (PP (IN of) (NP (NP (NNS thousands)) (PP (IN of) (NP (NNS people)))))) (VP (VBP have) (VP (VBN fled) (NP (NP (DT the) (NN fighting)) (, ,) (NP (NP (JJ overwhelming) (NNS camps)) (PP (IN for) (NP (DT the) (JJ displaced) (NN run)))) (PP (IN by) (NP (DT the) (NN government) (CC and) (NN aid) (NNS groups)))))) (. .)))" ] }] + +def test_petr1_formatted_to_results(): + petr1_results = petrarch.run_pipeline(formatted, write_output=False, + parsed=True) + correct1_results = {'57702678172ab87ec2dc9933': + [(u'20160626', u'IRQ', u'MED', u'010', u'57702678172ab87ec2dc9933_1', + 'http://www.cbsnews.com/news/iraqi-commander-fallujah-fully-liberated-after-a-month/', + 'cbs_world'), + (u'20160626', u'IRQMIL', u'IRQ', u'010', u'NAMED_TERROR_GROUP,1', + u'57702678172ab87ec2dc9933_0', + 'http://www.cbsnews.com/news/iraqi-commander-fallujah-fully-liberated-after-a-month/', + 'cbs_world') + ]} + assert petr1_results == correct1_results + +def test_petr2_formatted_to_results(): + petr2_results = petrarch2.run_pipeline(formatted, write_output=False, + parsed=True) + correct2_results = {'57702678172ab87ec2dc9933': + [(u'20160626', u'IRQMIL', u'MED', u'010', u'57702678172ab87ec2dc9933_1', + 'http://www.cbsnews.com/news/iraqi-commander-fallujah-fully-liberated-after-a-month/', + 'cbs_world'), + (u'20160626', u'IRQMIL', u'IRQ', u'010', u'NAMED_TERROR_GROUP,1', u'57702678172ab87ec2dc9933_0', + 'http://www.cbsnews.com/news/iraqi-commander-fallujah-fully-liberated-after-a-month/', + 'cbs_world') + ], + '57702642172ab87eb5dc98e9': + [(u'20160624', u'NGAPPL', u'---GOV', u'191', u'REFUGEES,1', u'57702642172ab87eb5dc98e9_1', + u'http://www.thetidenewsonline.com/2016/06/24/soldiers-policemen-fight-over-rice/', + u'nigeria_tidenews')], + '57702641172ab87eb7dc98fa': + [(u'20160626', u'IRQ', u'IMGMUSISIUAF', u'111', u'TERROR,1', + u'57702641172ab87eb7dc98fa_4', + u'http://www.menafn.com/1094827896/6-killed-in-attacks-in-Iraqi-capital-Friday?src=RSS', + 
u'menafn_iraq'), + (u'20160626', u'---CVL', u'IRQ', u'190', u'57702641172ab87eb7dc98fa_3', + u'http://www.menafn.com/1094827896/6-killed-in-attacks-in-Iraqi-capital-Friday?src=RSS', + u'menafn_iraq')]} + assert petr2_results == correct2_results + diff --git a/utilities.py b/utilities.py index f9b5c5e..a0b1fc1 100644 --- a/utilities.py +++ b/utilities.py @@ -2,7 +2,9 @@ from __future__ import unicode_literals import re import logging +import os from collections import namedtuple + from pymongo import MongoClient try: @@ -30,8 +32,14 @@ def parse_config(config_filename): Config information specifically related to the remote server for FTP uploading. + geo_list : Named tuple. + Config information for geocoding. + file_list: Named tuple. All the other config information not in ``server_list``. + + petrarch_version: Int + Either 1 or 2, indicating whether Petrarch or Petrarch2 should be used. """ parser = ConfigParser() parser.read(config_filename) @@ -42,17 +50,31 @@ def parse_config(config_filename): username = parser.get('Server', 'username') password = parser.get('Server', 'password') server_dir = parser.get('Server', 'server_dir') - cliff_host = parser.get('Server', 'cliff_host') - cliff_port = parser.get('Server', 'cliff_port') server_attrs = namedtuple('ServerAttributes', ['serv_name', 'username', 'password', - 'server_dir', - 'cliff_host', - 'cliff_port']) + 'server_dir']) server_list = server_attrs(serv_name, username, password, - server_dir, cliff_host, cliff_port) + server_dir) + + geo_service = parser.get('Geolocation', 'geo_service') + cliff_host = parser.get('Geolocation', 'cliff_host') + cliff_port = parser.get('Geolocation', 'cliff_port') + mordecai_host = parser.get('Geolocation', 'mordecai_host') + mordecai_port = parser.get('Geolocation', 'mordecai_port') + + + geo_attrs = namedtuple('GeolocationAttributes', ['geo_service', + 'cliff_host', + 'cliff_port', + 'mordecai_host', + 'mordecai_port' + ]) + + geo_list = geo_attrs(geo_service, cliff_host, 
cliff_port, + mordecai_host, mordecai_port) + # these are listed in the order generated scraper_stem = parser.get('Pipeline', 'scraper_stem') @@ -66,15 +88,28 @@ def parse_config(config_filename): auth_db = parser.get('Auth', 'auth_db') auth_user = parser.get('Auth', 'auth_user') auth_pass = parser.get('Auth', 'auth_pass') + db_host = parser.get('Auth', 'db_host') else: auth_db = '' auth_user = '' auth_pass = '' + db_host = os.getenv('MONGO_HOST') or None if 'Logging' in parser.sections(): log_file = parser.get('Logging', 'log_file') else: log_file = '' + petrarch_version = parser.get('Petrarch', 'petrarch_version') + + if 'Mongo' in parser.sections(): + db_db = parser.get('Mongo', 'db') + db_collection = parser.get('Mongo', 'collection') + else: + db_db = 'event_scrape' + db_collection = 'stories' + + + file_attrs = namedtuple('FileAttributes', ['scraper_stem', 'recordfile_stem', 'fullfile_stem', @@ -85,14 +120,19 @@ def parse_config(config_filename): 'log_file', 'auth_db', 'auth_user', - 'auth_pass']) + 'auth_pass', + 'db_host', + 'db_db', + 'db_collection']) file_list = file_attrs(scraper_stem, recordfile_stem, fullfile_stem, eventfile_stem, dupfile_stem, outputfile_stem, oneaday_filter, log_file, auth_db, auth_user, - auth_pass) + auth_pass, db_host, db_db, db_collection) + - return server_list, file_list + + return server_list, geo_list, file_list, petrarch_version except Exception as e: print('Problem parsing config file. {}'.format(e)) @@ -134,7 +174,7 @@ def do_RuntimeError(st1, filename='', st2=''): raise RuntimeError(st1 + ' ' + filename + ' ' + st2) -def make_conn(db_auth, db_user, db_pass): +def make_conn(db_db, db_collection, db_auth, db_user, db_pass, db_host=None): """ Function to establish a connection to a local MonoDB instance. @@ -158,11 +198,15 @@ def make_conn(db_auth, db_user, db_pass): Collection within MongoDB that holds the scraped news stories. 
""" - client = MongoClient() + + if db_host: + client = MongoClient(db_host) + else: + client = MongoClient() if db_auth: client[db_auth].authenticate(db_user, db_pass) - database = client.event_scrape - collection = database['stories'] + database = client[db_db] + collection = database[db_collection] return collection