It seems that you're looking for a regexp. =P
Here is some code I built specially for you:
import re
def check_th_add_street(address):
    """Insert the word 'street' after ordinal numbers that lack it.

    An ordinal is 1 to 3 digits followed by 'st', 'nd', 'rd' or 'th' and one
    whitespace character (e.g. ``'60th '``).

    Args:
        address: The address string to inspect.

    Returns:
        The rewritten address (``str``) when the first ordinal is not followed
        by 'street'; ``True`` when there is no ordinal at all or when the first
        ordinal is already followed by 'street' (nothing to do).
    """
    # Raw strings keep '\d' and '\s' as regex escapes rather than string escapes.
    ordinal = re.compile(r"(?P<number>\d{1,3}(?:st|nd|rd|th)\s)")
    # Case-insensitive so that '60th Street' is recognised as already correct.
    street_word = re.compile(r"street", re.IGNORECASE)

    first = ordinal.search(address)
    if first is None:
        return True  # there is no number like 'th, st, nd, rd'
    if street_word.match(address, first.end()):
        return True  # the format is good (followed by 'street')

    def add_street(match):
        # Leave ordinals that already have 'street' untouched; rewriting them
        # blindly would produce 'street street'.
        if street_word.match(address, match.end()):
            return match.group()
        return match.group() + 'street '

    return ordinal.sub(add_street, address)
I'm a Python learner, so please let me know whether it solves your issue.
Tested on a small list of addresses.
Hope it helps or leads you to a solution.
Thank you!
EDIT
Improved to also handle the case where the number is followed by "avenue" or "road" as well as "street":
import re
def check_th_add_street(address):
    """Insert 'street' after ordinal numbers not followed by a street word.

    An ordinal is 1 to 3 digits followed by 'th', 'st', 'nd' or 'rd' and one
    whitespace character. The accepted street words are 'avenue', 'road' and
    'street'.

    Args:
        address: The address string to inspect.

    Returns:
        The rewritten address (``str``) when the first ordinal is not followed
        by a street word; ``True`` when there is no ordinal at all or when the
        first ordinal is already followed by one (nothing to do).
    """
    # Raw strings keep '\d' and '\s' as regex escapes rather than string escapes.
    ordinal = re.compile(r'(?P<number>\d{1,3}(?:th|st|nd|rd)\s)')
    # Case-insensitive so that '12th Avenue' is recognised as already correct.
    street_word = re.compile(r'avenue|road|street', re.IGNORECASE)

    first = ordinal.search(address)
    if first is None:
        return True  # there is no number like 'th, st, nd, rd'
    if street_word.match(address, first.end()):
        return True  # do nothing

    def add_street(match):
        # Skip ordinals that already carry a street word; rewriting them
        # blindly would produce e.g. '2nd street street'.
        if street_word.match(address, match.end()):
            return match.group()
        return match.group() + 'street '

    return ordinal.sub(add_street, address)
RE-EDIT
I made some improvements for your needs and added a usage example:
import re
# Sample input: original addresses, some of them badly formatted
# (an ordinal number with no street word after it).
address_list = [
    # ordinal with no street word following
    '30 w 60th new york',
    '30 w 60th new york',
    '30 w 21st new york',
    '30 w 23rd new york',
    '30 w 1231st new york',
    '30 w 1452nd new york',
    '30 w 1300th new york',
    '30 w 1643rd new york',
    '30 w 22nd new york',
    # ordinal already followed by 'street'
    '30 w 60th street new york',
    '30 w 60th street new york',
    '30 w 21st street new york',
    '30 w 22nd street new york',
    '30 w 23rd street new york',
    # no ordinal at all
    '30 w brown street new york',
    # more ordinals with no street word following
    '30 w 1st new york',
    '30 w 2nd new york',
    '30 w 116th new york',
    # ordinal followed by 'avenue' or 'road'
    '30 w 121st avenue new york',
    '30 w 121st road new york',
    '30 w 123rd road new york',
    '30 w 12th avenue new york',
    '30 w 151st road new york',
    '30 w 15th road new york',
    '30 w 16th avenue new york',
]
def check_th_add_street(address):
    """Return *address* with 'street' inserted after bare ordinal numbers.

    An ordinal is 1 to 4 digits followed by 'th', 'st', 'nd' or 'rd' and one
    whitespace character. The accepted street words are 'avenue', 'road' and
    'street'.

    Args:
        address: The address string to normalise.

    Returns:
        The original address when it contains no ordinal or when its first
        ordinal is already followed by a street word; otherwise a copy in
        which 'street ' is inserted after every ordinal that is not already
        followed by a street word.
    """
    # Raw strings keep '\d' and '\s' as regex escapes rather than string escapes.
    ordinal = re.compile(r'(?P<number>\d{1,4}(?:th|st|nd|rd)\s)')
    # Case-insensitive so that '60th Street' is recognised as already correct.
    street_word = re.compile(r'avenue|road|street', re.IGNORECASE)

    first = ordinal.search(address)
    if first is None:
        # there is no number like 'th, st, nd, rd' -> return original address
        return address
    if street_word.match(address, first.end()):
        return address  # return original address

    def add_street(match):
        # Skip ordinals that already carry a street word; rewriting them
        # blindly would produce e.g. '2nd street street'.
        if street_word.match(address, match.end()):
            return match.group()
        return match.group() + 'street '

    return ordinal.sub(add_street, address)
# Build the new clean list: every address goes through check_th_add_street().
new_address_list = [check_th_add_street(address) for address in address_list]

# Alternatively, normalise a single address right where you need it, e.g.:
#     address = check_th_add_street(address)
#     print(address)

# Use the new list to do your work.
for address in new_address_list:
    # Parenthesised single-argument print runs on both Python 2 and Python 3.
    print("Formated address is : %s" % address)  # or whatever you want to do with 'address'
This will output:
Formated address is : 30 w 60th street new york
Formated address is : 30 w 60th street new york
Formated address is : 30 w 21st street new york
Formated address is : 30 w 23rd street new york
Formated address is : 30 w 1231st street new york
Formated address is : 30 w 1452nd street new york
Formated address is : 30 w 1300th street new york
Formated address is : 30 w 1643rd street new york
Formated address is : 30 w 22nd street new york
Formated address is : 30 w 60th street new york
Formated address is : 30 w 60th street new york
Formated address is : 30 w 21st street new york
Formated address is : 30 w 22nd street new york
Formated address is : 30 w 23rd street new york
Formated address is : 30 w brown street new york
Formated address is : 30 w 1st street new york
Formated address is : 30 w 2nd street new york
Formated address is : 30 w 116th street new york
Formated address is : 30 w 121st avenue new york
Formated address is : 30 w 121st road new york
Formated address is : 30 w 123rd road new york
Formated address is : 30 w 12th avenue new york
Formated address is : 30 w 151st road new york
Formated address is : 30 w 15th road new york
Formated address is : 30 w 16th avenue new york
RE-RE-EDIT
The final function: added the count parameter to re.sub(), so that at most one occurrence of the pattern is replaced:
def check_th_add_street(address):
    """Return *address* with 'street' inserted after its first bare ordinal.

    An ordinal is 1 to 4 digits followed by 'th', 'st', 'nd' or 'rd' and one
    whitespace character. The accepted street words are 'avenue', 'road' and
    'street'.

    Args:
        address: The address string to normalise.

    Returns:
        The original address when it contains no ordinal or when its first
        ordinal is already followed by a street word; otherwise a copy with
        'street ' inserted after the first ordinal only.
    """
    # One raw-string pattern serves both the lookup and the substitution, so
    # the two can never drift apart and '\d' / '\s' stay regex escapes.
    ordinal = re.compile(r'(?P<number>\d{1,4}(?:th|st|nd|rd)\s)')

    first = ordinal.search(address)
    if first is None:
        return address  # there is no number like 'th, st, nd, rd'

    # Check what follows the first ordinal. Case-insensitive so that
    # '60th Street' is not turned into '60th street Street'.
    following = address[first.end():]
    if re.match(r'avenue|road|street', following, re.IGNORECASE):
        return address  # do nothing

    # count=1: the maximum number of pattern occurrences to be replaced, i.e.
    # only the ordinal that was just checked.
    return ordinal.sub(r'\g<number>street ', address, count=1)