-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathData_sorting.py
241 lines (211 loc) · 9.31 KB
/
Data_sorting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
"""This class contains calculation for sorting data in order to make the plotting scripts lighter
@author Boris Gailleton"""
import numpy as np
import pandas as bamboo_bears
from pyproj import Proj, transform
def source_num_of_longest_river(arr):
"""
This function will return the source number of the longuest channel by counting the number of occurence of each sources number.
It may be an issue if your channel have similar size, I will create a fancier function that calculate the actual lenght of each channel if required.
"""
counount = 0
max_count = -999
max_count_index = -999
for i in range(1,arr.shape[0]):
if(arr[i] == arr[i-1]):
counount += 1
else:
if (counount > max_count):
max_count_index = arr[i-1]
max_count = counount
counount = 0
return max_count_index
def crop_point_data_to_base_raster(raster_name, raster_directory, csv_file, EPSG_code = 0):
"""
This function create a new csv file cropped to the base raster. It can lower the processing time if your point data is on a significantly larger area than the base raster.
"""
print("ok let me load your dataset and your hdr file")
# Read the file
df = bamboo_bears.read_csv(csv_file)
# Read and sort the csv_info
with open(raster_directory+raster_name+".hdr","r") as hdr_file:
print("I got these")
for line in hdr_file:
if(line[0:8] == "map info"):
info = line[12:-2]
info = info.split(",")
x_min = float(info[3])
y_max = float(info[4])
x_res = float(info[5])
y_res = float(info[6])
utm_zone = int(info[7])
utm_hemisphere = info[8]
else:
if(line[0:7] == "samples"):
num_col = line.replace(" ","").split("=")[1]
print("there are " + str(num_col) + " columns")
num_col = int(num_col)
else:
if(line[0:5] == "lines"):
num_lines = line.replace(" ","").split("=")[1]
print("there are " + str(num_lines) + " lines")
num_lines = int(num_lines)
# Now I calculate the size of the dem
x_max = x_min + x_res*num_col
y_min = y_max - y_res*num_lines
# Conversion UTM to lat/long
inProj = Proj(init='epsg:'+str(EPSG_code))
outProj = Proj(init='epsg:4326')
long_min,lat_min = transform(inProj,outProj,x_min,y_min)
long_max,lat_max = transform(inProj,outProj,x_max,y_max)
# data sorting
df = df[df.longitude<long_max]
df = df[df.latitude<lat_max]
df = df[df.latitude>lat_min]
df = df[df.longitude>long_min]
df.to_csv(csv_file[:-4]+"_"+raster_name+"_filtered.csv", index = False)
#return the name of the new csv file
return csv_file[:-4]+"_"+raster_name+"_filtered.csv"
def select_data_from_one_col(csv_file, col_name = "None",values =[], wanted = True):
"""
This function selects specific data from a csv file using pandas (the package, not the majestic animal).
It requires the name+path of csv, the name of the column, a python list containing the wanted values and the wanted parameter (True to select the data, False to select the opposite)
"""
# Ignore this
word = "excluding"
if(wanted):
word = "including"
###########
print("I am going to sort your data " + col_name +" by " + word + " the following value(s):")
print(values)
# import the file
df = bamboo_bears.read_csv(csv_file,sep = ",")
# Select the data
if(wanted):
retArray = df[df[col_name].isin(values)]
else:
if(wanted == False):
retArray = df[~df[col_name].isin(values)]
retArray.to_csv(csv_file[:-4]+"_SDFOC.csv", index = False)
print("Your new file is ready and is there: " + str(csv_file[:-4]+"_SDFOC.csv"))
return csv_file[:-4]+"_SDFOC.csv"
def quick_load_bil_raster(raster_name, raster_path):
"""
This function return a numpy 2Darray from a bil-ENVI raster. It uses numpy.fromfile(), quite efficient.
"""
# Read the number of raw/col to recast the array
with open(raster_path+raster_name+".hdr","r") as hdr_file:
print("I am opening your raster info")
for line in hdr_file:
#testing the data type
if(line[0:9] == "data type"):
info_dtype = line[-2:]
info_dtype = int(info_dtype)
else:
if(line[0:8] == "map info"):
info = line[12:-2]
info = info.split(",")
x_min = float(info[3])
y_max = float(info[4])
x_res = int(info[5])
y_res = int(info[6])
utm_zone = int(info[7])
utm_hemisphere = info[8]
else:
if(line[0:7] == "samples"):
num_col = line.replace(" ","").split("=")[1]
print("there are " + str(num_col) + " columns")
num_col = int(num_col)
else:
if(line[0:5] == "lines"):
num_lines = line.replace(" ","").split("=")[1]
print("there are " + str(num_lines) + " lines")
num_lines = int(num_lines)
#The type of data representation:
#1 = Byte: 8-bit unsigned integer
#2 = Integer: 16-bit signed integer
#3 = Long: 32-bit signed integer
#4 = Floating-point: 32-bit single-precision
#5 = Double-precision: 64-bit double-precision floating-point
#6 = Complex: Real-imaginary pair of single-precision floating-point
#9 = Double-precision complex: Real-imaginary pair of double precision floating-point
#12 = Unsigned integer: 16-bit
#13 = Unsigned long integer: 32-bit
#14 = 64-bit long integer (signed)
#15 = 64-bit unsigned long integer (unsigned)
if(info_dtype == 1):
data_type = np.dtype('uint8')
else:
if(info_dtype == 2):
data_type = np.dtype('int16')
else:
if(info_dtype == 3):
data_type = np.dtype('int32')
else:
if(info_dtype == 4):
data_type = np.dtype('Float32')
else:
if(info_dtype == 5):
data_type = np.dtype('Float64')
else:
if(info_dtype == 12):
data_type = np.dtype('uint16')
print("your data type is " + str(data_type))
# Alright now loading and converting the data
print("I am now ingesting your raster")
raster_data = np.fromfile(raster_path+raster_name+".bil",data_type).reshape(num_lines,num_col)
print("I nailed it")
x_max = x_min + x_res*num_col
y_min = y_max - y_res*num_lines
print("I am returning the raster array and info")
return raster_data,x_min,x_max,y_min,y_max,x_res,y_res
def get_basin_middle(df,wdir = "", key = "basin_key", keyDA = "drainage area"):
"""
Create a dataframe containing the lat/long of the center of a basin and the key of this one. Usefull to visualize basin key when QGIS is not working...
Default values adapted for output from chi_mapping tool
Author: Boris
"""
dfbasin = []
lst_basins = df[key].unique()
nb_basin = lst_basins.shape[0]
for bas in lst_basins:
print("I am processing basin %s/%s " %(bas, nb_basin))
temp = []
long_max = df["longitude"][df[key] == bas].max()
long_min = df["longitude"][df[key] == bas].min()
lat_max = df["latitude"][df[key] == bas].max()
lat_min = df["latitude"][df[key] == bas].min()
DA = df[keyDA][df[key] == bas].max()
BK = bas
temp = [(lat_min+lat_max)/2,(long_min+long_max)/2,BK,DA]
dfbasin.append(temp)
header = ["latitude", "longitude", "basin_key", "drainage area"]
print("I am now creating a dataframe")
dfbasin = bamboo_bears.DataFrame(dfbasin,columns = header)
print("I am saving a csv file f you want to reload directly this data as %sBASIN_LOCA.csv" %(wdir))
dfbasin.to_csv(wdir+"BASIN_LOCA.csv", index = False)
print(dfbasin)
print("done with the basin location")
return dfbasin
def pandas_to_unique_df(idf,columns = 'source_ID', otype = 'list'):
"""
Take a dataframe, return a list or dictionary of dataframes from the uniques values of one of its colums.
Can be useful to plot basins_river more easily, I already needed it in some cases, I don't really remember why!
But now it is coded anyway.
@args:
idf (pandas dataframe): The input dataframe
columns (str): the name of the column containing the unique values
type (str): Do you want a list or a dict (the key would be the unique value)
@return:
A list of dataframes
"""
if otype == 'list':
odf = [] # the Output list
for i in idf.unique(): # looping through the unique values of the dataframe
odf.append(idf[idf[columns] == i]) # appending a single dataframe
else:
odf = {}
for i in idf.unique():
odf[i] = idf[idf[columns] == i]
return odf
#