# 文本清理

```
import re
import json
import os
import unicodedata
import string 
import re  # regular expression

# Base path of the raw JSON dumps; read_json_file appends '_<number>.txt' to it.
json_o_filename = './output_files/json_file_original'
# Text file that save_o_text is asked to write in the driver code of this page.
text_o_filename = './output_files/text_original.txt'
# Path for cleaned tweet text; not referenced by the code visible on this page.
cleaned_text_filename = './output_files/cleaned_tweet_text.txt'

```

```
# read tweets from json file
def read_json_file(json_filename, json_file_number):
    """Load and parse one numbered JSON dump.

    The file path is built as ``<json_filename>_<json_file_number>.txt``.

    Args:
        json_filename: Path prefix of the dump, without the numeric suffix.
        json_file_number: Number appended (after an underscore) to the prefix.

    Returns:
        The Python object produced by parsing the file's JSON content.

    Raises:
        FileNotFoundError: If the computed path does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    json_file = json_filename + '_' + str(json_file_number) + '.txt'
    # Open unconditionally: a missing file then surfaces as a clear
    # FileNotFoundError instead of an UnboundLocalError on the return value.
    with open(json_file, 'r', encoding="utf-8") as f:
        return json.load(f)
```

```
# clean text functions
def remove_at(text_sentence):
    """Remove every ``@`` followed by a run of non-whitespace characters.

    Args:
        text_sentence: The text to process.

    Returns:
        The text with each ``@...`` token deleted; surrounding whitespace
        is left untouched.
    """
    # Raw string: "\S" in a plain literal is an invalid escape sequence.
    return re.sub(r"@\S+", '', text_sentence)

def remove_hashtag(text_sentence):
    """Remove every ``#`` followed by a run of non-whitespace characters.

    Args:
        text_sentence: The text to process.

    Returns:
        The text with each ``#...`` token deleted; surrounding whitespace
        is left untouched.
    """
    # Raw string: "\S" in a plain literal is an invalid escape sequence.
    return re.sub(r"#\S+", '', text_sentence)

def remove_url(text_sentence):
    """Remove tokens that start with ``http`` (including ``https`` URLs).

    The match starts at any occurrence of ``http`` that is followed by at
    least one non-whitespace character and runs to the next whitespace.

    Args:
        text_sentence: The text to process.

    Returns:
        The text with the matched tokens deleted.
    """
    # Raw string: "\S" in a plain literal is an invalid escape sequence.
    return re.sub(r"https*\S+", '', text_sentence)

def remove_punctuation(text_sentence):
    """Delete every ASCII punctuation character listed in ``string.punctuation``.

    Args:
        text_sentence: The text to process.

    Returns:
        The text without ASCII punctuation; all other characters are kept.
    """
    deletion_table = str.maketrans('', '', string.punctuation)
    return text_sentence.translate(deletion_table)

def remove_number(text_sentence):
    """Delete every word-character run that contains at least one digit.

    Args:
        text_sentence: The text to process.

    Returns:
        The text with digit-bearing tokens removed; whitespace is untouched.
    """
    digit_token = re.compile(r'\w*\d+\w*')
    return digit_token.sub('', text_sentence)

def remove_space(text_sentence):
    """Collapse whitespace runs and trim the ends of the text.

    Args:
        text_sentence: The text to process.

    Returns:
        The text with every run of two or more whitespace characters
        replaced by a single space, and leading/trailing whitespace removed.
    """
    # Replace with one space, not '': deleting the run would glue the
    # neighbouring words together ("a  b" -> "ab").
    text_out = re.sub(r'\s{2,}', ' ', text_sentence)
    return text_out.strip()

def remove_others(text_sentence):
    """Strip carriage returns and turn tabs / form feeds into spaces.

    Args:
        text_sentence: The text to process.

    Returns:
        The text with ``\\r\\n`` and ``\\r`` removed, and ``\\t`` and ``\\f``
        each replaced by a single space.
    """
    # Each step must build on the previous result; applying every replace to
    # the original input would keep only the effect of the last one.
    text_out = text_sentence.replace('\r\n', '')  # Windows line ending, before the lone '\r'
    text_out = text_out.replace('\r', '')         # carriage return
    text_out = text_out.replace('\t', ' ')        # horizontal tab
    text_out = text_out.replace('\f', ' ')        # form feed
    return text_out

def remove_unicode(text_sentence):
    """Drop every character that cannot be encoded as ASCII.

    Args:
        text_sentence: The text to process.

    Returns:
        The text restricted to its ASCII characters, in the original order.
    """
    ascii_bytes = text_sentence.encode('ascii', 'ignore')
    return ascii_bytes.decode()

def join_multi_line(text_sentence):
    """Concatenate the items of ``text_sentence`` with edge newlines stripped.

    Iterating a string yields single characters, so for a string argument
    every newline character is dropped; for a list of lines, only the
    leading and trailing newlines of each line are removed.

    Args:
        text_sentence: A string or an iterable of strings.

    Returns:
        One string made of all the stripped pieces joined without separator.
    """
    return ''.join(piece.strip('\n') for piece in text_sentence)
```

```
def clear_text(text):
    """Run the full cleaning pipeline on ``text`` and return the result.

    The steps run in a fixed order: remove_at, remove_hashtag, remove_url,
    remove_punctuation, remove_number, remove_space, remove_unicode,
    remove_others, join_multi_line. A numbered banner and the intermediate
    text are printed after every step so the effect of each one is visible.

    Args:
        text: The raw text to clean.

    Returns:
        The text after all cleaning steps have been applied.
    """
    steps = (
        ('remove_at', remove_at),
        ('remove_hashtag', remove_hashtag),
        ('remove_url', remove_url),
        ('remove_punctuation', remove_punctuation),
        ('remove_number', remove_number),
        ('remove_space', remove_space),
        ('remove_unicode', remove_unicode),
        ('remove_others', remove_others),
        ('join_multi_line', join_multi_line),
    )

    print('--------------- begin to clean ---------------\n')

    for index, (label, step) in enumerate(steps, start=1):
        print('\n%d- %s\n' % (index, label))
        text = step(text)
        print(text)

    print('\n--------------- end clean ----------------\n')

    # Hand the result back: without this the function yields None and
    # callers cannot use the cleaned text.
    return text
 
```

```
def save_o_text(filename, json_parsed, option):
    """Write each tweet as a numbered row followed by its cleaned form.

    A header line is written first. For every item, the row
    ``<number> | <first 10 chars of created_at> | <text>`` is written and
    printed, then the row is passed through clear_text and the result is
    written on its own line after a blank separator line.

    Args:
        filename: Path of the output file.
        json_parsed: Iterable of mappings that provide the 'created_at' and
            'text' keys.
        option: Mode string passed to ``open`` (for example 'w' or 'a').
    """
    with open(filename, option, encoding="utf-8") as f:
        f.write('number | created time | text \n')
        for number, data in enumerate(json_parsed, start=1):
            create_time = data['created_at']
            tweet_text = remove_space(data['text'])
            text = str(number) + ' | ' + create_time[:10] + ' | ' + tweet_text + '\n'
            f.write(text)
            print(text)
            # NOTE(review): this requires clear_text to return the cleaned string.
            cleaned_text = clear_text(text)
            # Terminate the cleaned line so the next record starts on a
            # fresh line instead of being appended to this one.
            f.write('\n' + cleaned_text + '\n')
            
```

```
# Driver: load dump number 1, then write its 'data' entries to the text file.
json_file_number = 1
json_parsed = read_json_file(
    json_filename=json_o_filename,
    json_file_number=json_file_number,
)
# Debug aid: print(json.dumps(json_parsed, indent=4, sort_keys=True))

save_o_text(
    filename=text_o_filename,
    json_parsed=json_parsed['data'],
    option='w',
)
```

>


---

# Agent Instructions: Querying This Documentation

If you need additional information that is not directly available in this page, you can query the documentation dynamically by asking a question.

Perform an HTTP GET request on the current page URL with the `ask` query parameter:

```
GET https://ai-candy2023.gitbook.io/introduction/artificial-intelligence/python/wen-ben-qing-li.md?ask=<question>
```

The question should be specific, self-contained, and written in natural language.
The response will contain a direct answer to the question and relevant excerpts and sources from the documentation.

Use this mechanism when the answer is not explicitly present in the current page, you need clarification or additional context, or you want to retrieve related documentation sections.
