文本清理

import re
import json
import os
import unicodedata
import string 
import re  # regular expression

# Path prefix of the original JSON files (presumably the prefix handed to
# read_json_file, which appends "_<number>.txt" -- confirm against callers).
json_o_filename = "./output_files/json_file_original"
# Path of the original (uncleaned) text file.
text_o_filename = "./output_files/text_original.txt"
# Path of the cleaned tweet text file.
cleaned_text_filename = "./output_files/cleaned_tweet_text.txt"
def read_json_file(json_filename, json_file_number):
    """Read tweets from one numbered JSON file and return the parsed content.

    The file that is read is ``<json_filename>_<json_file_number>.txt``,
    decoded as UTF-8.

    Args:
        json_filename: Path prefix of the file, without the numeric suffix.
        json_file_number: Number appended after an underscore to pick the file.

    Returns:
        The Python object obtained by parsing the file's JSON content.

    Raises:
        FileNotFoundError: If the target file does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    json_file = json_filename + '_' + str(json_file_number) + '.txt'
    # Open directly rather than guarding with os.path.exists(): the guarded
    # version fell through to ``return parsed`` with ``parsed`` unbound and
    # failed with a confusing UnboundLocalError when the file was missing.
    with open(json_file, 'r', encoding="utf-8") as f:
        return json.load(f)
# clean text functions
def remove_at(text_sentence):
    """Remove @-mentions from the text.

    Each ``@`` followed by one or more non-whitespace characters is deleted.
    Surrounding whitespace is left in place, and a lone ``@`` is kept.

    Args:
        text_sentence: The string to clean.

    Returns:
        The string with every ``@...`` token removed.
    """
    # Raw string: "\S" in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    return re.sub(r"@\S+", '', text_sentence)

def remove_hashtag(text_sentence):
    """Remove hashtags from the text.

    Each ``#`` followed by one or more non-whitespace characters is deleted.
    Surrounding whitespace is left in place, and a lone ``#`` is kept.

    Args:
        text_sentence: The string to clean.

    Returns:
        The string with every ``#...`` token removed.
    """
    # Raw string: "\S" in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    return re.sub(r"#\S+", '', text_sentence)

def remove_url(text_sentence):
    """Remove URLs from the text.

    Deletes every match of ``http``, then any number of ``s``, then one or
    more non-whitespace characters. The pattern does not require ``://``, so
    any token continuing after ``http`` (e.g. ``httpd``) is removed as well.

    Args:
        text_sentence: The string to clean.

    Returns:
        The string with every matching token removed.
    """
    # Raw string: "\S" in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    return re.sub(r"https*\S+", '', text_sentence)

def remove_punctuation(text_sentence):
    """Delete every character listed in ``string.punctuation`` from the text.

    Args:
        text_sentence: The string to clean.

    Returns:
        The string without ASCII punctuation characters.
    """
    # Build a character class from the escaped punctuation set.
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    return re.sub(punctuation_class, '', text_sentence)

def remove_number(text_sentence):
    """Delete every run of word characters that contains at least one digit.

    Note that the whole token goes, not just the digits: ``abc123`` is
    removed entirely.

    Args:
        text_sentence: The string to clean.

    Returns:
        The string with all digit-bearing tokens removed.
    """
    digit_token = re.compile(r'\w*\d+\w*')
    return digit_token.sub('', text_sentence)

def remove_space(text_sentence):
    """Collapse repeated whitespace and trim both ends of the text.

    Every run of two or more whitespace characters becomes a single space,
    then leading and trailing whitespace is stripped.

    Args:
        text_sentence: The string to clean.

    Returns:
        The normalized string.
    """
    # Substitute a single space, not '': replacing the run with nothing
    # glued neighbouring words together ("a  b" became "ab").
    text_out = re.sub(r'\s{2,}', ' ', text_sentence)
    return text_out.strip()

def remove_others(text_sentence):
    """Strip carriage returns and turn tabs and form feeds into spaces.

    ``\\r\\n`` and ``\\r`` are deleted; ``\\t`` and ``\\f`` are each replaced
    by one space.

    Args:
        text_sentence: The string to clean.

    Returns:
        The cleaned string.
    """
    # Each step must work on the previous step's result; applying every
    # replace to the untouched input kept only the last substitution.
    # "\r\n" is handled before "\r" so a Windows line ending is removed as a
    # unit instead of leaving a stray "\n" behind.
    text_out = text_sentence.replace('\r\n', '')  # Windows line ending
    text_out = text_out.replace('\r', '')         # carriage return
    text_out = text_out.replace('\t', ' ')        # horizontal tab
    text_out = text_out.replace('\f', ' ')        # form feed
    return text_out

def remove_unicode(text_sentence):
    """Drop every non-ASCII character from the text.

    Args:
        text_sentence: The string to clean.

    Returns:
        The string containing only its ASCII characters.
    """
    # Characters that cannot be encoded as ASCII are silently discarded.
    ascii_bytes = text_sentence.encode('ascii', 'ignore')
    return ascii_bytes.decode()

def join_multi_line(text_sentence):
    """Concatenate the items of ``text_sentence`` after trimming newlines.

    Leading and trailing ``\\n`` characters are stripped from each item and
    the results are joined with no separator. Iterating a plain string
    yields single characters, so in that case every newline is removed.

    Args:
        text_sentence: An iterable of strings (or a single string).

    Returns:
        The joined string.
    """
    return ''.join(piece.strip('\n') for piece in text_sentence)

Last updated

Was this helpful?