Source code for bardi.nlp_engineering.regex_library.pathology_report

"""Curated set of regular expressions for cleaning text from pathology reports."""

from typing import List

from bardi.nlp_engineering.regex_library import regex_lib
from bardi.nlp_engineering.regex_library.regex_set import RegexSet, RegexSubPair


class PathologyReportRegexSet(RegexSet):
    """Ordered collection of regex substitutions that normalize pathology reports.

    Note
    ----
    The expressions were designed around the observation that splitting text
    on every punctuation mark tends to destroy important information: a term
    such as "her-2" must stay in one piece. To keep the number of unique
    tokens manageable, a number of expressions nevertheless separate some
    tokens around punctuation, e.g. :code:`22-years` becomes :code:`22 years`.
    This matters in particular for the word2vec algorithm, where an excessive
    number of tokens can impede the model's effectiveness by diluting the
    representation of key concepts.

    Every flag below defaults to ``True``. A flag that evaluates to false
    leaves the corresponding substitution out of the set. The substitutions
    are stored in the order in which the flags are listed, followed by a
    final spacing-condensing substitution that is not configurable.

    Parameters
    ----------
    convert_escape_codes: bool
        Removes escape codes such as \\x0d, \\x0a, etc.
    handle_whitespaces: bool
        Removes extra whitespaces: any new line, carriage return, tab.
    remove_urls: bool
        Removes URLs found in the text that match the pattern.
    remove_special_punct: bool
        Removes special punctuation like (?,$).
    remove_multiple_punct: bool
        Removes duplicated punctuation. E.g. :code:`---`
    handle_angle_brackets: bool
        Removes angle brackets.
        E.g. :code:`<title>` becomes :code:`title`.
    replace_percent_sign: bool
        Replaces a percent sign with a 'percent' word.
    handle_leading_digit_punct: bool
        Removes punctuation when a digit is attached to a word.
        E.g. :code:`22-years` becomes :code:`22 years`.
    remove_leading_punct: bool
        Removes leading punctuation from words.
        E.g. :code:`-result` becomes :code:`result`.
    remove_trailing_punct: bool
        Removes trailing punctuation from words.
        E.g. :code:`result-` becomes :code:`result`.
    handle_words_with_punct_spacing: bool
        Matches words with a hyphen, colon or period and splits them.
    handle_math_spacing: bool
        Matches "math operator symbols" like ><=%: and adds spaces
        around them.
    handle_dimension_spacing: bool
        Matches digits and x and adds spaces between them.
    handle_measure_spacing: bool
        Matches measurements in mm, cm and ml and provides proper spacing
        between the digits and the measure.
    handle_cassettes_spacing: bool
        Matches patterns like 5e-6f and adds spaces around them.
    handle_dash_digit_spacing: bool
        Matches dashes around digits and adds spaces around the dashes.
    handle_literals_floats_spacing: bool
        Matches a character followed by a float and a word. This is a
        common formatting problem.
        E.g. :code:`r18.0admission` becomes :code:`r18.0 admission`.
    fix_pluralization: bool
        Matches an s character after a word and attaches it back to the
        word. This restores plural nouns damaged by removed punctuation.
    handle_digits_words_spacing: bool
        Matches digits that are attached to the beginning of a word.
    remove_phone_numbers: bool
        Matches any phone number that consists of 10 digits with
        delimiters.
    remove_dates: bool
        Removes dates of a prespecified format.
    remove_times: bool
        Matches time of format 11:20 am or 1.30pm or 9:52:07AM.
    remove_addresses: bool
        Matches any address of format num (street name) in 1 to 6 words,
        2-letter state and short or long zip code.
    remove_dimensions: bool
        Matches 2D or 3D dimension measurements and adds spaces around
        them.
    remove_specimen: bool
        Matches the marking of a pathology specimen.
    remove_decimal_seg_numbers: bool
        Matches combinations of digits and periods or dashes.
        E.g. :code:` 1.78.9.87`.
    remove_large_digits_seq: bool
        Matches large sequences of digits (3 or more) and replaces them.
    remove_large_floats_seq: bool
        Matches large floats and replaces them.
    trunc_decimals: bool
        Matches floats and keeps only the first decimal.
    remove_cassette_names: bool
        Removes pathology samples' markings. E.g. :code:`1-e`.
    remove_duration_time: bool
        Removes the duration a specimen was treated.
        E.g. :code:`32d09090301`.
    remove_letter_num_seq: bool
        Removes a character followed directly by 6 to 10 digits.

    Attributes
    ----------
    regex_set: List[RegexSubPair]
        The enabled substitutions, in application order.
    """

    def __init__(
        self,
        convert_escape_codes: bool = True,  # 0
        handle_whitespaces: bool = True,  # 1
        remove_urls: bool = True,  # 2
        remove_special_punct: bool = True,  # 3
        remove_multiple_punct: bool = True,  # 4
        handle_angle_brackets: bool = True,  # 5
        replace_percent_sign: bool = True,  # 6
        handle_leading_digit_punct: bool = True,  # 7
        remove_leading_punct: bool = True,  # 8
        remove_trailing_punct: bool = True,  # 9
        handle_words_with_punct_spacing: bool = True,  # 10
        handle_math_spacing: bool = True,  # 11
        handle_dimension_spacing: bool = True,  # 12
        handle_measure_spacing: bool = True,  # 13
        handle_cassettes_spacing: bool = True,  # 14
        handle_dash_digit_spacing: bool = True,  # 15
        handle_literals_floats_spacing: bool = True,  # 16
        fix_pluralization: bool = True,  # 17
        handle_digits_words_spacing: bool = True,  # 18
        remove_phone_numbers: bool = True,  # 19
        remove_dates: bool = True,  # 20
        remove_times: bool = True,  # 21
        remove_addresses: bool = True,  # 22
        remove_dimensions: bool = True,  # 23
        remove_specimen: bool = True,  # 24
        remove_decimal_seg_numbers: bool = True,  # 25
        remove_large_digits_seq: bool = True,  # 26
        remove_large_floats_seq: bool = True,  # 27
        trunc_decimals: bool = True,  # 28
        remove_cassette_names: bool = True,  # 29
        remove_duration_time: bool = True,  # 30
        remove_letter_num_seq: bool = True,  # 31
    ):
        # === List of regex sub pairs ===
        self.regex_set: List[RegexSubPair] = []

        # One row per configurable step, in application order:
        #   (flag value, instance attribute that keeps the pair,
        #    regex_lib getter that produces the pair)
        # The getter is stored uncalled so that it only runs for enabled steps.
        configurable_steps = (
            # 0 removes escape codes
            (
                convert_escape_codes,
                "regex_sub_escape_codes",
                regex_lib.get_escape_code_regex,
            ),
            # 1 any new line, carriage return, tab and multiple spaces --> " "
            (
                handle_whitespaces,
                "regex_sub_whitespaces",
                regex_lib.get_whitespace_regex,
            ),
            # 2 removes URLs that start with https, http or www
            (remove_urls, "regex_sub_urls", regex_lib.get_urls_regex),
            # 3 matches a set of special punctuation: ,();[]#{}* --> " "
            (
                remove_special_punct,
                "regex_sub_special_punct",
                regex_lib.get_special_punct_regex,
            ),
            # 4 matches multiple occurrences of symbols like -, . and _
            (
                remove_multiple_punct,
                "regex_sub_multiple_punct",
                regex_lib.get_multiple_punct_regex,
            ),
            # 5 removes angle brackets: <THIS IS INSIDE> --> THIS IS INSIDE
            (
                handle_angle_brackets,
                "regex_sub_angle_brackets",
                regex_lib.get_angle_brackets_regex,
            ),
            # 6 replaces % with a percent word: 56% --> 56 PERCENT
            (
                replace_percent_sign,
                "regex_sub_percent_sign",
                regex_lib.get_percent_sign_regex,
            ),
            # 7
            (
                handle_leading_digit_punct,
                "regex_sub_leading_digit_punct",
                regex_lib.get_leading_digit_punctuation_regex,
            ),
            # 8
            (
                remove_leading_punct,
                "regex_sub_leading_punct",
                regex_lib.get_leading_punctuation_regex,
            ),
            # 9
            (
                remove_trailing_punct,
                "regex_sub_trailing_punct",
                regex_lib.get_trailing_punctuation_regex,
            ),
            # 10
            (
                handle_words_with_punct_spacing,
                "regex_sub_words_with_punct_spacing",
                regex_lib.get_words_with_punct_spacing_regex,
            ),
            # 11
            (
                handle_math_spacing,
                "regex_sub_math_spacing",
                regex_lib.get_math_spacing_regex,
            ),
            # 12
            (
                handle_dimension_spacing,
                "regex_sub_dimension_spacing",
                regex_lib.get_dimension_spacing_regex,
            ),
            # 13
            (
                handle_measure_spacing,
                "regex_sub_measure_spacing",
                regex_lib.get_measure_spacing_regex,
            ),
            # 14
            (
                handle_cassettes_spacing,
                "regex_sub_cassette_spacing",
                regex_lib.get_cassettes_spacing_regex,
            ),
            # 15
            (
                handle_dash_digit_spacing,
                "regex_sub_dash_spacing",
                regex_lib.get_dash_digits_spacing_regex,
            ),
            # 16
            (
                handle_literals_floats_spacing,
                "regex_sub_literals_floats_spacing",
                regex_lib.get_literals_floats_spacing_regex,
            ),
            # 17
            (
                fix_pluralization,
                "regex_sub_fix_pluralization",
                regex_lib.get_fix_pluralization_regex,
            ),
            # 18
            (
                handle_digits_words_spacing,
                "regex_sub_digits_words_spacing",
                regex_lib.get_digits_words_spacing_regex,
            ),
            # 19 remove phone numbers
            (
                remove_phone_numbers,
                "regex_sub_phone_numbers",
                regex_lib.get_phone_number_regex,
            ),
            # 20 remove dates
            (remove_dates, "regex_sub_dates", regex_lib.get_dates_regex),
            # 21 remove time
            (remove_times, "regex_sub_time", regex_lib.get_time_regex),
            # 22 remove addresses
            (remove_addresses, "regex_sub_address", regex_lib.get_address_regex),
            # 23 dimension substitution
            (
                remove_dimensions,
                "regex_sub_dimensions",
                regex_lib.get_dimensions_regex,
            ),
            # 24 specimen
            (remove_specimen, "regex_sub_specimen", regex_lib.get_specimen_regex),
            # 25
            (
                remove_decimal_seg_numbers,
                "regex_sub_decimal_seg_numbers",
                regex_lib.get_decimal_segmented_numbers_regex,
            ),
            # 26
            (
                remove_large_digits_seq,
                "regex_sub_large_digits_seq",
                regex_lib.get_large_digits_seq_regex,
            ),
            # 27
            (
                remove_large_floats_seq,
                "regex_sub_large_floats_seq",
                regex_lib.get_large_float_seq_regex,
            ),
            # 28
            (
                trunc_decimals,
                "regex_sub_trunc_decimals",
                regex_lib.get_trunc_decimals_regex,
            ),
            # 29
            (
                remove_cassette_names,
                "regex_sub_cassette_names",
                regex_lib.get_cassette_name_regex,
            ),
            # 30
            (
                remove_duration_time,
                "regex_sub_duration_time",
                regex_lib.get_duration_regex,
            ),
            # 31
            (
                remove_letter_num_seq,
                "regex_sub_letter_num_seq",
                regex_lib.get_letter_num_seq_regex,
            ),
        )

        # === Retrieve regular expression substitution pairs from regex_lib
        for enabled, attribute_name, fetch_pair in configurable_steps:
            if not enabled:
                # Disabled step: no attribute is created and nothing is appended.
                continue
            pair = fetch_pair()
            setattr(self, attribute_name, pair)
            self.regex_set.append(pair)

        # LAST condense spacing - not configurable, always executes
        self.regex_sub_spaces = regex_lib.get_spaces_regex()
        self.regex_set.append(self.regex_sub_spaces)