Text Cleaning
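This page collects the text-cleaning part of the pipeline: it reads tweets back from a saved json file, writes the original text (number, creation date, text) to a file, and runs each line through a chain of cleaning steps that strip @mentions, hashtags, URLs, punctuation, numbers, extra whitespace, non-ASCII characters, and control characters.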
import re  # regular expressions
import json
import os
import unicodedata
import string
json_o_filename = './output_files/json_file_original'
text_o_filename = './output_files/text_original.txt'
cleaned_text_filename = './output_files/cleaned_tweet_text.txt'
# read tweets from json file
def read_json_file(json_filename, json_file_number):
    json_file = json_filename + '_' + str(json_file_number) + '.txt'
    if os.path.exists(json_file):
        with open(json_file, 'r', encoding="utf-8") as f:
            json_string = f.read()
        parsed = json.loads(json_string)
        return parsed
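# Assumed input layout (not verified here): the json file looks like a Twitter API v2
# response, i.e. a top-level object whose "data" list holds tweets with at least the
# "created_at" and "text" fields used below, e.g.
# {"data": [{"created_at": "2021-06-01T10:00:00.000Z", "text": "an example tweet"}]}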
# text cleaning functions
def remove_at(text_sentence):
    text_out = re.sub(r'@\S+', '', text_sentence)
    return text_out

def remove_hashtag(text_sentence):
    text_out = re.sub(r'#\S+', '', text_sentence)
    return text_out

def remove_url(text_sentence):
    text_out = re.sub(r'https*\S+', '', text_sentence)
    return text_out

def remove_punctuation(text_sentence):
    text_out = re.sub('[%s]' % re.escape(string.punctuation), '', text_sentence)
    return text_out

def remove_number(text_sentence):
    text_out = re.sub(r'\w*\d+\w*', '', text_sentence)
    return text_out
def remove_space(text_sentence):
    # collapse runs of whitespace into a single space, then trim the ends
    text_out = re.sub(r'\s{2,}', ' ', text_sentence)
    text_out = text_out.strip()
    return text_out
def remove_others(text_sentence):
    text_out = text_sentence.replace('\r\n', '')  # Windows line break
    text_out = text_out.replace('\r', '')   # carriage return
    text_out = text_out.replace('\t', ' ')  # horizontal tab
    text_out = text_out.replace('\f', ' ')  # form feed
    return text_out

def remove_unicode(text_sentence):
    # drop non-ASCII characters (emoji, accented letters, etc.)
    text_out = text_sentence.encode('ascii', 'ignore').decode()
    return text_out
def join_multi_line(text_sentence):
    # join the lines of a multi-line tweet into a single line
    text_out = ''
    for line in text_sentence.splitlines():
        text_out += line
    return text_out
def clear_text(text):
    print('--------------- begin to clean ---------------\n')
    print('\n1- remove_at\n')
    text = remove_at(text)
    print(text)
    print('\n2- remove_hashtag\n')
    text = remove_hashtag(text)
    print(text)
    print('\n3- remove_url\n')
    text = remove_url(text)
    print(text)
    print('\n4- remove_punctuation\n')
    text = remove_punctuation(text)
    print(text)
    print('\n5- remove_number\n')
    text = remove_number(text)
    print(text)
    print('\n6- remove_space\n')
    text = remove_space(text)
    print(text)
    print('\n7- remove_unicode\n')
    text = remove_unicode(text)
    print(text)
    print('\n8- remove_others\n')
    text = remove_others(text)
    print(text)
    print('\n9- join_multi_line\n')
    text = join_multi_line(text)
    print(text)
    print('\n--------------- end clean ----------------\n')
    return text  # return the cleaned text so the caller can write it to a file
def save_o_text(filename, json_parsed, option):
    # option is the file mode, e.g. 'w' to overwrite or 'a' to append
    with open(filename, option, encoding="utf-8") as f:
        title = 'number | created time | text \n'
        f.write(title)
        number = 1
        for data in json_parsed:
            create_time = data['created_at']
            tweet_text = remove_space(data['text'])
            text = str(number) + ' | ' + create_time[:10] + ' | ' + tweet_text + '\n'
            f.write(text)
            number = number + 1
            print(text)
            cleaned_text = clear_text(text)
            f.write('\n' + cleaned_text)
# start processing
json_file_number = 1
json_parsed = read_json_file(json_o_filename, json_file_number)
# print(json.dumps(json_parsed, indent=4, sort_keys=True))
save_o_text(text_o_filename, json_parsed['data'], 'w')
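As a quick check of the cleaning chain on its own, independent of any json input, clear_text can also be called on a hand-written string. The sample tweet below is hypothetical and only meant to exercise each step:

sample = 'RT @user check this out!!! #NLP https://t.co/abc123 \t 100% worth it'
print(clear_text(sample))
# expected result, roughly: 'RT check this out worth it'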