| 
View
 

config.py (2015-11-16).txt

File history uploaded by Alan Liu 8 years, 8 months ago
# -*- coding: utf-8 -*-
##########################################
# Configuration file for scrub.py        #
# Must be in the same folder as scrub.py #
# See README file for instructions       #
##########################################

# File Configurations
# Paths use forward slashes; both values are plain strings.
# NOTE(review): presumably scrub.py reads the articles from input_file_path
# and writes the processed copies to output_file_path -- confirm in the README.
input_file_path = "C:/workspace/1/articles"
output_file_path = "C:/workspace/1/articles-scrubbed"
# NOTE(review): this is the string "true", not the boolean True.  scrub.py
# presumably compares against that text, so keep it a string unless scrub.py
# is changed as well.
save_log = "true"

# Options
#
# ``options`` is a list of iterations.  Each iteration is a dict holding a
# "values" list of {"find": ..., "replace": ...} rules and, optionally, a
# free-text "Comment".
#
# NOTE(review): the rules below assume that scrub.py walks the iterations and
# their rules in list order and applies every pair as a regular-expression
# substitution (the "\\1" back-references and the escaped dots only make sense
# that way) -- confirm against scrub.py.  Under that assumption:
#   * "find" is a regular expression, so a literal dot is written "\\." and a
#     literal question mark "\\?" (doubled backslash: these are not raw
#     strings);
#   * order matters: a longer phrase has to be listed BEFORE any shorter
#     phrase it contains, otherwise the shorter rule rewrites the text first
#     and the longer rule can no longer match;
#   * Iteration 2 inserts a space after a full stop that is directly followed
#     by a word character, so Iteration 3 matches abbreviations in their
#     spaced form ("U. S.", "Ph. D.").
options = [
    # Iteration 1 -- Initial Processing
    {
        "Comment": "Standardizes all occurrences of \"United States of America\" to \"United States\".",
        "values": [
            {"find": "United States of America", "replace": "United States"},
        ],
    },
    # Iteration 2 -- Punctuation
    {
        "values": [
            # Insert a space after ".", ":" or "?" when a word character
            # follows immediately.  Each rule targets its own punctuation mark.
            {"find": "\\.(\\w)", "replace": ". \\1"},
            {"find": ":(\\w)", "replace": ": \\1"},
            {"find": "\\?(\\w)", "replace": "? \\1"},
        ],
    },
    # Iteration 3 -- Tokenisation
    {
        "values": [
            # Boilerplate that contains "New York" / "New Jersey".  It must be
            # blanked out before those place names are tokenised further down.
            {"find": "Special to the New York Times", "replace": "[.]"},
            {"find": "New York Times", "replace": "[.]"},
            {"find": "620 Eighth Avenue, New York, N\\. ?Y\\. 10018-1405", "replace": "[.]"},
            # "Jerse?y" also accepts the misspelling "Jersy".
            {"find": "Sunday New Jerse?y Section", "replace": "[.]"},

            # Multi-word terms joined into single tokens.
            {"find": "Affordable Care Act", "replace": "Affordable_Care_Act"},
            {"find": "American Association of University Professors", "replace": "AAUP"},
            {"find": "American Studies Association", "replace": "American_Studies_Association"},
            # "Art History major/minor" must precede both "Art History" and
            # the plain "History major/minor" rules, which would otherwise
            # consume part of the phrase.
            {"find": "Art History major(\\.)", "replace": "art_history_major"},
            {"find": "Art History majors", "replace": "art_history_major"},
            {"find": "Art History minor(\\.)", "replace": "art_history_minor"},
            {"find": "Art History minors", "replace": "art_history_minor"},
            {"find": "Art History", "replace": "Art_History"},
            {"find": "center-left", "replace": "center_left"},
            {"find": "centre-left", "replace": "center_left"},
            {"find": "Chronicle of Higher Education", "replace": "Chronicle_of_Higher_Education"},
            {"find": "Cold War", "replace": "Cold_War"},
            {"find": "Common Core", "replace": "Common_Core"},
            {"find": "Department of Education", "replace": "Department_of_Education"},
            {"find": "distance learning", "replace": "distance_learning"},
            {"find": "East Coast", "replace": "East_Coast"},
            {"find": "East Asian", "replace": "East_Asian"},
            {"find": "hard headed", "replace": "hard_headed"},
            {"find": "hard nosed", "replace": "hard_nosed"},
            {"find": "hard sciences", "replace": "hard_sciences"},
            {"find": "hard science", "replace": "hard_science"},
            {"find": "hard times", "replace": "hard_times"},
            {"find": "hard wired", "replace": "hard_wired"},
            {"find": "hard working", "replace": "hard_working"},
            {"find": "hard work", "replace": "hard_work"},
            {"find": "Harvard University", "replace": "Harvard_University"},
            {"find": "H\\. M\\. O\\.", "replace": "HMO"},
            {"find": "American left", "replace": "American_left"},
            {"find": "British left", "replace": "British_left"},
            {"find": "left leaning", "replace": "left_leaning"},
            {"find": "left wing", "replace": "left_wing"},
            {"find": "the left", "replace": "the_left"},
            {"find": "American right", "replace": "American_right"},
            {"find": "British right", "replace": "British_right"},
            {"find": "right brain", "replace": "right_brain"},
            {"find": "right leaning", "replace": "right_leaning"},
            {"find": "right wing", "replace": "right_wing"},
            {"find": "the right", "replace": "the_right"},
            {"find": "Letters to the Editor", "replace": "Letters_to_the_Editor"},
            {"find": "liberal art(\\.)", "replace": "liberal_arts"},
            {"find": "liberal arts", "replace": "liberal_arts"},
            {"find": "liberal-arts", "replace": "liberal_arts"},
            {"find": "liberal-art(\\.)", "replace": "liberal_arts"},
            {"find": "Long Island", "replace": "Long_Island"},
            {"find": "long term", "replace": "long_term"},
            {"find": "long-term", "replace": "long_term"},
            {"find": "Los Angeles", "replace": "los_angeles"},

            # Majors: "<subject> major." and "<subject> majors" both collapse
            # to the singular token.
            # NOTE(review): the "(\\.)" group is captured but, unlike in the
            # "social science" rule, not re-emitted, so the full stop is
            # dropped -- confirm that this is intended.
            {"find": "English major(\\.)", "replace": "english_major"},
            {"find": "English majors", "replace": "english_major"},
            {"find": "History major(\\.)", "replace": "history_major"},
            {"find": "History majors", "replace": "history_major"},
            {"find": "Philosophy major(\\.)", "replace": "philosophy_major"},
            {"find": "Philosophy majors", "replace": "philosophy_major"},
            {"find": "French major(\\.)", "replace": "french_major"},
            {"find": "French majors", "replace": "french_major"},
            {"find": "Classics major(\\.)", "replace": "classics_major"},
            {"find": "Classics majors", "replace": "classics_major"},
            {"find": "Art major(\\.)", "replace": "art_major"},
            {"find": "Art majors", "replace": "art_major"},
            {"find": "Arts major(\\.)", "replace": "arts_major"},
            {"find": "Arts majors", "replace": "arts_major"},
            {"find": "Language major(\\.)", "replace": "language_major"},
            {"find": "Language majors", "replace": "language_major"},
            {"find": "humanities major(\\.)", "replace": "humanities_major"},
            {"find": "humanities majors", "replace": "humanities_major"},
            {"find": "major in the humanities", "replace": "major_in_the_humanities"},

            # Minors: same scheme as the majors above.
            {"find": "English minor(\\.)", "replace": "english_minor"},
            {"find": "English minors", "replace": "english_minor"},
            {"find": "History minor(\\.)", "replace": "history_minor"},
            {"find": "History minors", "replace": "history_minor"},
            {"find": "Philosophy minor(\\.)", "replace": "philosophy_minor"},
            {"find": "Philosophy minors", "replace": "philosophy_minor"},
            {"find": "French minor(\\.)", "replace": "french_minor"},
            {"find": "French minors", "replace": "french_minor"},
            {"find": "Classics minor(\\.)", "replace": "classics_minor"},
            {"find": "Classics minors", "replace": "classics_minor"},
            {"find": "Art minor(\\.)", "replace": "art_minor"},
            {"find": "Art minors", "replace": "art_minor"},
            {"find": "Arts minor(\\.)", "replace": "arts_minor"},
            {"find": "Arts minors", "replace": "arts_minor"},
            {"find": "Language minor(\\.)", "replace": "language_minor"},
            {"find": "Language minors", "replace": "language_minor"},
            {"find": "humanities minor(\\.)", "replace": "humanities_minor"},
            {"find": "humanities minors", "replace": "humanities_minor"},
            {"find": "minor in the humanities", "replace": "minor_in_the_humanities"},

            {"find": "M\\. D\\. s", "replace": "M_D"},
            {"find": "M\\. D\\.", "replace": "M_D"},
            # The adjective is folded into the noun token, so it has to be
            # matched before the bare "Middle East" rule.
            {"find": "Middle Eastern", "replace": "Middle_East"},
            {"find": "Middle East", "replace": "Middle_East"},
            {"find": "M\\. A\\.", "replace": "M_A"},
            {"find": "MacArthur Foundation", "replace": "MacArthur_Foundation"},
            {"find": "Modern Language Association", "replace": "MLA"},
            {"find": "National Endowment for the Humanities", "replace": "NEH"},
            {"find": "N\\. E\\. H\\.", "replace": "NEH"},
            {"find": "National Endowment for the Arts", "replace": "NEA"},
            {"find": "N\\. E\\. A\\.", "replace": "NEA"},
            {"find": "National Humanities Center", "replace": "National_Humanities_Center"},
            {"find": "National Commission on Excellence in Education", "replace": "National_Commission_on_Excellence_in_Education"},
            {"find": "New Left", "replace": "new_left"},
            {"find": "New Jersey", "replace": "New_Jersey"},
            {"find": "New York", "replace": "New_York"},
            {"find": "North American", "replace": "North_American"},
            {"find": "North America", "replace": "North_America"},
            {"find": "part time", "replace": "part_time"},
            # Dots escaped; the optional space covers the form produced by
            # Iteration 2 ("Ph. D.") as well as the original "Ph.D.".
            {"find": "Ph\\. ?D\\.", "replace": "PhD"},
            {"find": "queer studies", "replace": "queer_studies"},
            {"find": "queer theory", "replace": "queer_theory"},
            {"find": "Rockefeller Foundation", "replace": "Rockefeller_Foundation"},
            {"find": "social science(\\.)", "replace": "social_science\\1"},
            {"find": "social sciences", "replace": "social_sciences"},
            {"find": "social scientist", "replace": "social_scientist"},
            {"find": "social studies", "replace": "social_studies"},
            # Adjective folded into the noun token; must precede "South America".
            {"find": "South American", "replace": "South_America"},
            {"find": "South America", "replace": "South_America"},
            {"find": "The National Review", "replace": "The_National_Review"},
            {"find": "University of California", "replace": "University_of_California"},
            {"find": "University of Chicago", "replace": "University_of_Chicago"},
            {"find": "University of Bridgeport", "replace": "University_of_Bridgeport"},
            {"find": "U\\. S\\.", "replace": "U_S"},
            {"find": "United States", "replace": "U_S"},
            {"find": "Wall Street Journal", "replace": "Wall_Street_Journal"},
            {"find": "Yale University", "replace": "Yale_University"},

            # British -> American spelling.
            {"find": "centre", "replace": "center"},
            {"find": "labour", "replace": "labor"},
            {"find": "organisations", "replace": "organizations"},
            {"find": "programme", "replace": "program"},

            # Remaining boilerplate and possessive endings, replaced by the
            # "[.]" placeholder.
            {"find": "Associated Press", "replace": "[.]"},
            {"find": "Continue reading the main story", "replace": "[.]"},
            {"find": "Corrections & Amplifications", "replace": "[.]"},
            {"find": "Credit: By", "replace": "[.]"},
            {"find": "njtowns@nytimes\\. ?com", "replace": "[.]"},
            {"find": "N\\. ?Y\\. / Region", "replace": "[.]"},
            {"find": "Published:", "replace": "[.]"},
            {"find": "Room for Debate", "replace": "[.]"},
            {"find": "'s", "replace": "[.]"},
            {"find": "ʼs", "replace": "[.]"},
        ],
    },
    # Iteration 4 -- Extra Processing
    {
        "values": [],
    },
]

Comments (0)

You don't have permission to comment on this page.