You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
AQuery/rfdata_preproc.py

45 lines
1.1 KiB

import os
sep = os.sep

# ---- toggles ----
# Dataset to convert; one of: covtype, electricity, mixed, phishing, power.
dataset = 'electricity'
# False: convert the files one after another.
# True: hand the files to a multiprocessing pool instead.
use_threadpool = False  # True

# ---- environments ----
# Converted files go to data/<dataset>; raw inputs are read from
# data/<dataset>_orig.
output_prefix = sep.join(('data', dataset))
input_prefix = output_prefix + '_orig'
# Output delimiters: ';' between the leading fields of a row, ',' in front
# of the row's last field.
sep_subfield, sep_field = b';', b','

# Every entry of the input directory, in whatever order the OS reports them.
lst_files = os.listdir(input_prefix)
# lst_files.sort()

# Make sure the output directory is there; one that already exists is fine.
try:
    os.mkdir(output_prefix)
except FileExistsError:
    pass
def process(f: str) -> None:
    """Convert one space-separated input file into the output CSV layout.

    The file ``f`` is read as bytes from ``input_prefix`` and a file with the
    same base name and a ``.csv`` extension is written to ``output_prefix``.
    The output begins with an empty line (presumably a placeholder header
    row -- confirm with the consumer of these files), followed by one row
    per non-blank input line: every field except the last joined with
    ``sep_subfield``, then ``sep_field``, then the last field.

    Args:
        f: Name (not a path) of an entry inside ``input_prefix``.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    src_path = os.path.join(input_prefix, f)
    # Replace whatever extension the input has with '.csv'. Chopping a fixed
    # three characters off the name would mangle names whose extension is
    # not exactly three characters long (or that have no extension at all).
    stem, _ext = os.path.splitext(f)
    dst_path = os.path.join(output_prefix, stem + '.csv')

    # The whole input is loaded before the output is opened, so an unreadable
    # input never leaves a half-written output file behind.
    with open(src_path, 'rb') as ifile:
        icontents = ifile.read()

    with open(dst_path, 'wb') as ofile:
        ofile.write(b'\n')
        for line in icontents.splitlines():
            stripped = line.strip()
            if not stripped:
                # A blank line would otherwise be emitted as a bare
                # separator row with no data in it.
                continue
            fields = stripped.split(b' ')
            ofile.write(
                sep_subfield.join(fields[:-1])
                + sep_field + fields[-1] + b'\n'
            )
# Run the conversion over every listed input file.
if use_threadpool:
    # Parallel mode: only the process started as the main script creates the
    # pool, so an import of this module (for example by a worker process)
    # does not start another one.
    if __name__ == '__main__':
        from multiprocessing import Pool
        with Pool(8) as pool:
            pool.map(process, lst_files)
else:
    # Serial mode: convert the files one at a time, in listing order.
    for fname in lst_files:
        process(fname)