# Text cleaning
import re
import json
import os
import unicodedata
import string
import re # regular expression
# Input/output locations, all relative to ./output_files.
# Base name without numeric suffix or extension; read_json_file builds
# '<base>_<number>.txt' from a base name -- presumably this one (confirm
# against the caller).
json_o_filename = './output_files/json_file_original'
# NOTE(review): presumably the raw, uncleaned tweet text -- confirm.
text_o_filename = './output_files/text_original.txt'
# NOTE(review): presumably where the cleaned tweet text is written -- confirm.
cleaned_text_filename = './output_files/cleaned_tweet_text.txt'
# read tweets from json file
def read_json_file(json_filename, json_file_number):
    """Load one numbered JSON dump and return the parsed content.

    The file that is read is ``<json_filename>_<json_file_number>.txt``,
    decoded as UTF-8.

    Args:
        json_filename: Path prefix of the dump, without the numeric suffix
            or the ``.txt`` extension.
        json_file_number: Number appended to the prefix (converted with
            ``str``).

    Returns:
        The object produced by the JSON parser, or ``None`` when the file
        does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    json_file = json_filename + '_' + str(json_file_number) + '.txt'
    # A missing file is reported as None instead of failing on an
    # unbound local further down.
    if not os.path.exists(json_file):
        return None
    with open(json_file, 'r', encoding="utf-8") as f:
        return json.load(f)


# clean text functions
def remove_at(text_sentence):
    """Remove every '@' together with the non-whitespace run that follows it.

    Surrounding whitespace is left untouched, so "hi @bob there" becomes
    "hi  there". The match is not limited to the start of a word: the
    "@b.com" part of "a@b.com" is removed as well.

    Args:
        text_sentence: Text to clean.

    Returns:
        The text with all ``@...`` runs deleted.
    """
    # Raw string: "\S" in a normal literal is an invalid escape sequence.
    return re.sub(r"@\S+", '', text_sentence)
def remove_hashtag(text_sentence):
    """Remove every '#' together with the non-whitespace run that follows it.

    The whole hashtag is deleted (tag text included); surrounding
    whitespace is kept, so "love #python now" becomes "love  now".

    Args:
        text_sentence: Text to clean.

    Returns:
        The text with all ``#...`` runs deleted.
    """
    # Raw string: "\S" in a normal literal is an invalid escape sequence.
    return re.sub(r"#\S+", '', text_sentence)
def remove_url(text_sentence):
    """Remove http:// and https:// URLs from the text.

    A URL is taken to be ``http://`` or ``https://`` followed by everything
    up to the next whitespace character. Surrounding whitespace is kept.

    Args:
        text_sentence: Text to clean.

    Returns:
        The text with all matched URLs deleted.
    """
    # The scheme separator "://" is required so that ordinary words that
    # merely start with "http" (e.g. "https", "httpd") are not deleted.
    return re.sub(r"https?://\S+", '', text_sentence)
def remove_punctuation(text_sentence):
    """Delete every ASCII punctuation character from the text.

    The characters removed are exactly those in ``string.punctuation``;
    non-ASCII punctuation is left in place.

    Args:
        text_sentence: Text to clean.

    Returns:
        The text without ASCII punctuation.
    """
    # One pass with a deletion table; same result as substituting the
    # character class built from string.punctuation.
    return text_sentence.translate(str.maketrans('', '', string.punctuation))
def remove_number(text_sentence):
    """Remove every word that contains at least one digit.

    A "word" is a run of ``\\w`` characters, so "123" and "a1b2" are both
    removed in full. Surrounding whitespace is kept.

    Args:
        text_sentence: Text to clean.

    Returns:
        The text with all digit-containing words deleted.
    """
    return re.sub(r'\w*\d+\w*', '', text_sentence)
def remove_space(text_sentence):
    """Collapse runs of whitespace and trim both ends of the text.

    Every run of two or more whitespace characters is replaced by a single
    space; a lone whitespace character is left as it is. Leading and
    trailing whitespace is then stripped.

    Args:
        text_sentence: Text to clean.

    Returns:
        The normalized text.
    """
    # Replace with one space, not the empty string: deleting the run would
    # glue the neighbouring words together ("a  b" -> "ab").
    collapsed = re.sub(r'\s{2,}', ' ', text_sentence)
    return collapsed.strip()
def remove_others(text_sentence):
    """Remove carriage returns and turn tabs and form feeds into spaces.

    Replacements, applied in this order:
      * ``\\r\\n`` (Windows line ending) -> removed
      * ``\\r`` (carriage return)        -> removed
      * ``\\t`` (horizontal tab)         -> single space
      * ``\\f`` (form feed)              -> single space

    Args:
        text_sentence: Text to clean.

    Returns:
        The text with the control characters above removed or replaced.
    """
    # Each step works on the result of the previous one. "\r\n" is handled
    # before the bare "\r"; the other way round the pair could never match.
    text_out = text_sentence.replace('\r\n', '')
    text_out = text_out.replace('\r', '')
    text_out = text_out.replace('\t', ' ')
    text_out = text_out.replace('\f', ' ')
    return text_out
def remove_unicode(text_sentence):
    """Drop every non-ASCII character from the text.

    Characters that cannot be encoded as ASCII (accented letters, emoji,
    CJK, ...) are discarded, not transliterated: "café" becomes "caf".

    Args:
        text_sentence: Text to clean.

    Returns:
        The text restricted to its ASCII characters.
    """
    # errors='ignore' silently skips anything outside ASCII.
    ascii_bytes = text_sentence.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')
def join_multi_line(text_sentence):
    """Join the pieces of a multi-line text into one string without newlines.

    Each item obtained by iterating ``text_sentence`` has its leading and
    trailing newline characters stripped, and the results are concatenated
    with no separator. Passing a list of lines joins the lines; passing a
    plain string iterates character by character and therefore removes
    every newline in it.

    Args:
        text_sentence: Iterable of strings (e.g. a list of lines) or a
            single string.

    Returns:
        The concatenated text.
    """
    return ''.join(line.strip('\n') for line in text_sentence)