前言
这时候就需要pandas出动了
我们需要看看txt的具体结构
In [1]:
# Dump the raw file with 1-based line numbers so the row range of the data
# table can be read off the output.
with open('/home/mw/project/test.txt', 'r') as f:
    for line_number, line in enumerate(f, start=1):
        # `line` keeps its trailing newline, so print() emits an extra blank
        # line after each entry.
        print(f"Line {line_number}: {line}")
Line 1:
Line 2: <HTML>
Line 3: <HEAD>
Line 4: <TITLE>Wyoming Weather Web</TITLE>
Line 5:
Line 6: <link rel="stylesheet" href="/resources/weather.css" type="text/css">
Line 7:
Line 8: </HEAD>
Line 9:
Line 10: <BODY>
Line 11: <link rel="stylesheet" href="/resources/weather.css" type="text/css">
Line 12: <div id="masthead">
Line 13: <table style="min-width:1005px; max-width:2400px;" width="100%" border="0"
Line 14: cellpadding="0" cellspacing="0">
Line 15: <tr>
Line 16: <td><div id="logo-container">
Line 17: <img lay-src="/images/uwlogo.png" border="0"></div></td>
Line 18: <td valign="top" align="right">
Line 19: <table>
Line 20: <tr>
Line 21: <td valign="top" align="right"><div id="wywx"><a href="/index.shtml">
Line 22: Wyoming Weather Web</A></div></td>
Line 23: </tr>
Line 24: <tr>
Line 25: <td valign="top" align="right" width="600">
Line 26: <div id="depts">
Line 27: <a href="http://www.uwyo.edu/atsc">Atmospheric Science</A> |
Line 28: <a href="http://www.uwyo.edu/ceas">Engineering and Applied Science</A> |
Line 29: <a href="http://www.uwyo.edu/">UWyo Home</A>
Line 30: </div>
Line 31: </td>
Line 32: </tr>
Line 33: </table>
Line 34: </td>
Line 35: </tr>
Line 36: </table>
Line 37: </div>
Line 38:
Line 39: <!-- left menu -->
Line 40: <div id="leftmenu">
Line 41: <P/>
Line 42: <P/>
Line 43: <UL>
Line 44: <LI><a href="/wyoming/">Wyoming</a></LI>
Line 45: <LI><a href="/cities/">US Cities</a></LI>
Line 46: <LI><a href="/surface/">Surface</a></LI>
Line 47: <LI><a href="/upperair/">Upper Air</a></LI>
Line 48: <UL>
Line 49: <LI><A HREF="/upperair/sounding.html" TARGET="_top">TEMP Soundings</A></LI>
Line 50: <LI><A HREF="/upperair/bufrraob.shtml" TARGET="_top">BUFR Soundings</A></LI>
Line 51: </UL>
Line 52: <LI><a href="/models/fcst/">Models</a></LI>
Line 53: </UL>
Line 54: <p>Other data for station: 54511</p>
Line 55: <ul>
Line 56: <li><a href="/cgi-bin/bufrraob.py?datetime=2022-01-17 12:00:00&id=54511&type=TEXT:LIST">1200Z Jan 17</a></li>
Line 57: <li><a href="/cgi-bin/bufrraob.py?datetime=2022-01-18 12:00:00&id=54511&type=TEXT:LIST">1200Z Jan 18</a></li>
Line 58: <li><a href="/cgi-bin/bufrraob.py?datetime=2022-01-18 00:00:00&id=54511&type=TEXT:CSV">Comma Separated Values</a></li>
Line 59: <li><a href="/cgi-bin/bufrraob.py?datetime=2022-01-18 00:00:00&id=54511&type=PNG:SKEWT">Skew-T PNG image</a></li>
Line 60: <li><a href="/cgi-bin/bufrraob.py?datetime=2022-01-18 00:00:00&id=54511&type=PNG:STUVE">Stuve PNG image</a></li>
Line 61: <li><a href="/cgi-bin/bufrraob.py?datetime=2022-01-18 00:00:00&id=54511&type=PNG:STUVE10">Stuve PNG image to 10 hPa</a></li>
Line 62: <li><a href="/cgi-bin/bufrraob.py?datetime=2022-01-18 00:00:00&id=54511&type=INVENTORY">Inventory for year</a></li>
Line 63: </ul>
Line 64: </div>
Line 65:
Line 66: <!-- main menu -->
Line 67: <div id="maincolumn">
Line 68: <H1>Observations for Station 54511 starting 2315Z 17 Jan 2022</H1>
Line 69: <H3>BEIJING, CHINA</H3>
Line 70: <BR/><I>Latitude: 39.930 Longitude: 116.280</I>
Line 71: <PRE>
Line 72: -----------------------------------------------------------------------------
Line 73: PRES HGHT TEMP DWPT RELH MIXR DRCT SPED THTA THTE THTV
Line 74: hPa m C C % g/kg deg m/s K K K
Line 75: -----------------------------------------------------------------------------
Line 76: 1022.5 34 -8.3 -13.9 64 1.28 33 1.9 263.2 266.7 263.4
Line 77: 1019.9 52 -6.4 -12.9 60 1.39 42 1.7 265.3 269.1 265.5
Line 78: 1013.4 101 -4.7 -12.2 56 1.48 67 1.4 267.4 271.6 267.7
Line 79: 1005.4 164 -4.4 -13.2 50 1.38 98 0.9 268.3 272.2 268.6
Line 80: 1004.6 169 -4.4 -13.2 50 1.38 100 0.9 268.4 272.3 268.6
Line 81: 1000.0 209 -4.3 -12.9 51 1.42 118 1.2 268.9 272.9 269.1
Line 82: 999.0 214 -4.2 -12.7 52 1.44 120 1.3 269.0 273.1 269.3
Line 83: 976.7 394 -4.5 -13.5 49 1.38 199 2.8 270.5 274.4 270.7
Line 84: 970.5 446 -4.0 -13.5 48 1.39 221 3.3 271.5 275.4 271.7
Line 85: 955.4 570 -3.3 -12.8 48 1.50 224 5.0 273.4 277.7 273.6
Line 86: 936.9 723 -3.8 -13.4 47 1.45 227 7.1 274.4 278.6 274.7
Line 87: 925.0 824 -4.2 -13.8 47 1.42 231 7.7 275.0 279.1 275.2
Line 88: 910.1 961 -4.6 -13.8 49 1.45 236 8.5 275.9 280.1 276.1
Line 89: 905.4 1004 -4.7 -14.4 47 1.39 238 8.7 276.2 280.2 276.4
Line 90: 896.5 1084 -4.9 -16.0 41 1.23 244 8.1 276.8 280.4 277.0
Line 91: 889.1 1150 -4.3 -15.5 41 1.29 248 7.6 278.0 281.9 278.3
Line 92: 878.4 1240 -5.1 -16.4 41 1.21 255 6.9 278.2 281.8 278.4
Line 93: 868.1 1335 -4.8 -18.9 32 0.99 261 6.2 279.4 282.4 279.6
Line 94: 852.9 1463 -5.8 -21.0 29 0.84 268 5.2 279.8 282.4 279.9
Line 95: 850.0 1489 -5.7 -21.2 28 0.83 269 5.0 280.2 282.7 280.3
Line 96: 842.1 1557 -5.8 -21.8 27 0.80 273 4.5 280.8 283.3 280.9
Line 97: 839.9 1577 -5.9 -22.2 26 0.77 274 4.3 280.9 283.3 281.0
Line 98: 815.3 1808 -6.7 -24.4 23 0.65 295 5.0 282.5 284.5 282.6
Line 99: 807.7 1879 -7.3 -25.1 23 0.62 302 5.2 282.6 284.5 282.7
Line 100: 790.9 2034 -8.7 -26.6 22 0.55 290 5.7 282.8 284.5 282.9
Line 101: 778.8 2155 -9.0 -27.6 21 0.51 281 6.2 283.7 285.3 283.8
Line 102: 776.1 2180 -9.3 -28.2 20 0.48 279 6.3 283.7 285.2 283.8
Line 103: 749.5 2456 -11.5 -30.4 19 0.41 268 8.6 284.1 285.4 284.2
Line 104: 747.2 2482 -11.4 -30.5 19 0.41 267 8.8 284.5 285.8 284.5
Line 105: 745.1 2507 -10.9 -30.5 18 0.41 269 9.1 285.2 286.6 285.3
Line 106: 737.9 2587 -10.7 -31.7 16 0.37 275 10.1 286.3 287.5 286.3
Line 107: 719.7 2784 -11.1 -33.6 14 0.31 290 12.5 287.9 288.9 287.9
Line 108: 716.5 2818 -11.2 -33.2 14 0.33 291 12.8 288.1 289.2 288.2
Line 109: 704.5 2944 -12.0 -33.9 14 0.31 293 13.5 288.6 289.7 288.7
Line 110: 700.0 2989 -11.7 -33.4 15 0.33 293 13.8 289.5 290.6 289.6
Line 111: 691.4 3088 -10.6 -32.9 14 0.35 295 14.4 291.7 292.9 291.8
Line 112: 691.0 3094 -10.6 -32.9 14 0.35 295 14.4 291.8 293.0 291.9
Line 113: 671.4 3317 -11.5 -33.8 14 0.33 296 15.3 293.2 294.3 293.3
Line 114: 634.6 3749 -14.3 -37.4 12 0.24 294 16.9 294.8 295.6 294.8
Line 115: 633.9 3760 -14.3 -37.4 12 0.24 294 16.9 294.9 295.7 294.9
Line 116: 625.7 3858 -14.4 -38.1 11 0.23 297 17.1 295.8 296.6 295.9
Line 117: 607.4 4077 -16.3 -39.1 12 0.21 304 17.6 296.2 296.9 296.2
Line 118: 600.0 4162 -16.8 -34.9 19 0.33 305 17.4 296.6 297.8 296.7
Line 119: 581.4 4407 -18.8 -30.3 36 0.53 306 16.8 297.0 298.8 297.1
Line 120: 580.1 4425 -18.9 -30.3 36 0.53 306 16.8 297.1 298.8 297.1
Line 121: 575.1 4490 -19.4 -30.4 37 0.53 305 17.1 297.2 299.0 297.3
Line 122: 559.9 4685 -20.8 -31.6 37 0.49 302 17.8 297.8 299.5 297.9
Line 123: 553.1 4781 -21.1 -32.8 34 0.44 300 18.2 298.5 300.0 298.6
Line 124: 551.0 4804 -21.1 -33.1 33 0.43 300 18.1 298.8 300.3 298.9
Line 125: 543.0 4910 -22.0 -34.3 32 0.39 300 17.7 299.0 300.4 299.1
Line 126: 533.9 5029 -23.1 -33.2 39 0.44 299 17.3 299.2 300.7 299.2
Line 127: 501.0 5491 -26.5 -33.8 50 0.44 300 16.8 300.5 302.0 300.6
Line 128: 500.0 5505 -26.6 -33.8 51 0.44 300 16.9 300.5 302.1 300.6
Line 129: 495.1 5585 -27.0 -34.0 51 0.44 299 16.9 300.9 302.4 301.0
Line 130: 477.6 5852 -29.4 -36.1 52 0.37 296 17.2 301.1 302.3 301.1
Line 131: 476.5 5872 -29.6 -36.3 52 0.36 296 17.2 301.0 302.3 301.1
Line 132: 451.9 6253 -32.0 -39.0 50 0.29 299 19.9 302.6 303.6 302.6
Line 133: 451.5 6259 -32.0 -39.0 50 0.29 299 20.0 302.7 303.7 302.7
Line 134: 428.8 6616 -35.2 -41.9 50 0.23 305 20.2 303.1 303.9 303.1
Line 135: 427.6 6635 -35.3 -42.0 50 0.22 305 20.2 303.2 304.0 303.2
Line 136: 400.0 7078 -38.1 -45.7 45 0.16 302 21.7 305.4 306.0 305.4
Line 137: 394.7 7171 -38.8 -46.5 44 0.15 302 22.0 305.6 306.2 305.7
Line 138: 392.8 7210 -39.2 -46.9 44 0.14 302 22.2 305.5 306.1 305.6
Line 139: 360.7 7778 -44.7 -51.2 48 0.10 302 23.6 305.7 306.1 305.7
Line 140: 349.9 7986 -45.9 -52.3 48 0.09 303 24.0 306.8 307.1 306.8
Line 141: 345.0 8076 -46.5 -53.3 46 0.08 303 23.9 307.2 307.5 307.2
Line 142: 330.6 8373 -49.3 -56.5 43 0.06 303 23.8 307.1 307.3 307.1
Line 143: 323.4 8509 -50.7 -57.9 42 0.05 304 23.4 307.1 307.3 307.1
Line 144: 314.2 8709 -51.7 -59.1 41 0.04 306 22.8 308.3 308.4 308.3
Line 145: 311.7 8761 -51.4 -59.0 40 0.04 307 22.7 309.4 309.6 309.4
Line 146: 307.4 8851 -51.1 -58.7 40 0.05 307 22.2 311.0 311.2 311.1
Line 147: 301.3 8968 -51.8 -59.5 39 0.04 307 21.5 311.8 312.0 311.9
Line 148: 300.0 8990 -51.7 -59.4 39 0.04 307 21.4 312.4 312.5 312.4
Line 149: 291.9 9157 -51.3 -59.3 38 0.04 307 20.5 315.4 315.6 315.4
Line 150: 285.7 9286 -51.2 -59.5 36 0.04 305 19.8 317.5 317.7 317.5
Line 151: 274.2 9565 -53.0 -61.7 34 0.03 302 18.3 318.6 318.8 318.6
Line 152: 271.4 9619 -53.6 -62.3 34 0.03 302 17.9 318.7 318.8 318.7
Line 153: 268.6 9700 -53.6 -62.4 33 0.03 303 17.4 319.6 319.8 319.6
Line 154: 261.9 9863 -54.6 -63.5 32 0.03 304 16.3 320.5 320.6 320.5
Line 155: 258.3 9952 -54.6 -63.6 32 0.03 305 15.7 321.7 321.9 321.8
Line 156: 257.3 9972 -54.8 -63.9 31 0.03 305 15.6 321.8 321.9 321.8
Line 157: 250.0 10164 -55.7 -64.9 31 0.02 304 15.8 323.1 323.2 323.1
Line 158: 242.3 10381 -56.4 -65.7 30 0.02 303 16.1 325.0 325.1 325.0
Line 159: 241.0 10422 -56.6 -66.0 30 0.02 302 16.4 325.2 325.3 325.2
Line 160: 233.0 10641 -55.9 -65.5 29 0.02 298 18.3 329.4 329.5 329.4
Line 161: 227.2 10792 -56.3 -66.0 28 0.02 295 19.6 331.2 331.3 331.2
Line 162: 221.3 10948 -56.4 -66.2 28 0.02 295 19.8 333.5 333.6 333.5
Line 163: 219.5 11010 -54.4 -64.4 28 0.03 295 19.9 337.4 337.5 337.4
Line 164: 207.6 11345 -55.2 -65.6 26 0.03 296 21.0 341.5 341.7 341.5
Line 165: 201.9 11516 -54.4 -65.1 26 0.03 297 22.1 345.5 345.7 345.5
Line 166: 200.0 11585 -54.7 -65.5 25 0.03 297 22.5 346.0 346.1 346.0
Line 167: 197.0 11669 -55.5 -66.3 25 0.03 297 22.8 346.2 346.3 346.2
Line 168: 193.2 11797 -54.9 -66.0 24 0.03 296 23.1 349.1 349.2 349.1
Line 169: 185.4 12039 -56.2 -67.5 23 0.02 295 23.7 351.1 351.2 351.1
Line 170: 184.4 12067 -56.2 -67.5 23 0.02 294 23.8 351.7 351.8 351.7
Line 171: 179.1 12237 -54.6 -66.3 22 0.03 290 23.9 357.2 357.4 357.2
Line 172: 173.4 12465 -53.8 -65.7 22 0.03 285 24.1 361.9 362.0 361.9
Line 173: 171.2 12557 -53.6 -65.6 22 0.03 283 25.3 363.5 363.7 363.5
Line 174: 169.1 12621 -54.3 -66.4 21 0.03 282 26.1 363.6 363.8 363.7
Line 175: 162.1 12891 -54.1 -66.5 20 0.03 276 29.6 368.4 368.6 368.4
Line 176: 161.5 12919 -53.7 -66.2 20 0.03 276 29.7 369.5 369.6 369.5
Line 177: 153.5 13253 -55.3 -68.0 19 0.03 278 30.3 372.1 372.3 372.1
Line 178: 152.2 13317 -55.0 -67.8 19 0.03 278 30.4 373.6 373.7 373.6
Line 179: 150.0 13425 -54.8 -67.7 19 0.03 279 29.3 375.5 375.6 375.5
Line 180: 149.5 13450 -54.8 -67.7 19 0.03 279 29.1 375.8 376.0 375.8
Line 181: 143.2 13723 -56.4 -69.5 18 0.02 280 26.3 377.7 377.8 377.7
Line 182: 142.2 13776 -56.6 -69.7 18 0.02 280 26.0 378.1 378.2 378.1
Line 183: 139.0 13895 -55.5 -68.9 17 0.03 279 25.6 382.5 382.6 382.5
Line 184: 134.6 14122 -55.5 -69.0 17 0.03 277 24.7 386.0 386.2 386.0
Line 185: 129.1 14393 -56.0 -69.7 16 0.02 279 24.8 389.7 389.9 389.8
Line 186: 119.8 14865 -54.6 -68.7 16 0.03 285 25.5 400.7 400.9 400.7
Line 187: 110.7 15317 -55.7 -70.1 15 0.03 289 25.4 407.8 408.0 407.8
Line 188: 109.1 15403 -56.2 -70.6 15 0.03 289 25.3 408.6 408.7 408.6
Line 189: 105.9 15596 -55.4 -70.0 15 0.03 288 25.0 413.6 413.7 413.6
Line 190: 101.3 15921 -56.6 -71.2 14 0.03 287 24.3 416.6 416.7 416.6
Line 191: 100.0 16006 -56.3 -71.0 14 0.03 287 24.1 418.7 418.8 418.7
Line 192: </PRE>
Line 193:
Line 194: <div id="footer">
Line 195: <p/>
Line 196: <HR SIZE="1">
Line 197: <I>Interested in graduate studies in atmospheric science?
Line 198: Check out our program at the
Line 199: <a href="http://www.uwyo.edu/atsc/howtoapply/"
Line 200: target=_top>University of Wyoming
Line 201: </a></I>
Line 202: <HR SIZE="1"><FONT SIZE="-1">
Line 203: Questions about the weather data provided by this site can be
Line 204: addressed to <A HREF="mailto:ldoolman@uwyo.edu">
Line 205: Larry Oolman (ldoolman@uwyo.edu)</A></FONT>
Line 206: <HR SIZE="1">
Line 207: </div>
Line 208:
Line 209: </div>
Line 210: </BODY>
Line 211: </HTML>
如上,我们知道需要获取的是76-191行的数据,pandas启动
关键参数:skiprows 为从文件开头起跳过的行数
skipfooter:从文件末尾起跳过的行数(仅 engine='python' 支持)
例如我们需要76-191行,则是前面跳过75,结尾跳过211-191=20行
In [5]:
import pandas as pd
# Parse only the sounding table, i.e. physical lines 76-191 of the 211-line page:
#   skiprows=75   drops everything before the first data row;
#   skipfooter=20 drops the 211 - 191 = 20 trailing lines (</PRE> and footer markup).
# skipfooter is not supported by the C parser, hence engine='python'.
data = pd.read_csv(
    '/home/mw/project/test.txt',
    sep=r'\s+',  # raw string: '\s' in a plain literal is an invalid escape sequence
    header=None,
    skiprows=75,
    skipfooter=20,
    names=['P', 'HT', 'TEMP', 'DWPT', 'RH', 'Q', 'DRCT', 'WS', 'THTA', 'THTE', 'THTV'],
    engine='python',
)
data
Out[5]:
P | HT | TEMP | DWPT | RH | Q | DRCT | WS | THTA | THTE | THTV | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1022.5 | 34 | -8.3 | -13.9 | 64 | 1.28 | 33 | 1.9 | 263.2 | 266.7 | 263.4 |
1 | 1019.9 | 52 | -6.4 | -12.9 | 60 | 1.39 | 42 | 1.7 | 265.3 | 269.1 | 265.5 |
2 | 1013.4 | 101 | -4.7 | -12.2 | 56 | 1.48 | 67 | 1.4 | 267.4 | 271.6 | 267.7 |
3 | 1005.4 | 164 | -4.4 | -13.2 | 50 | 1.38 | 98 | 0.9 | 268.3 | 272.2 | 268.6 |
4 | 1004.6 | 169 | -4.4 | -13.2 | 50 | 1.38 | 100 | 0.9 | 268.4 | 272.3 | 268.6 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
111 | 110.7 | 15317 | -55.7 | -70.1 | 15 | 0.03 | 289 | 25.4 | 407.8 | 408.0 | 407.8 |
112 | 109.1 | 15403 | -56.2 | -70.6 | 15 | 0.03 | 289 | 25.3 | 408.6 | 408.7 | 408.6 |
113 | 105.9 | 15596 | -55.4 | -70.0 | 15 | 0.03 | 288 | 25.0 | 413.6 | 413.7 | 413.6 |
114 | 101.3 | 15921 | -56.6 | -71.2 | 14 | 0.03 | 287 | 24.3 | 416.6 | 416.7 | 416.6 |
115 | 100.0 | 16006 | -56.3 | -71.0 | 14 | 0.03 | 287 | 24.1 | 418.7 | 418.8 | 418.7 |
116 rows × 11 columns
批处理代码
In [ ]:
import os
import pandas as pd
import numpy as np
# Folder that holds the downloaded sounding pages.
folder_path = '/home/mw/project'
# Converted CSV files go here. Create the directory up front so to_csv cannot
# fail on a missing path.
output_dir = '/home/mw/project/New Folder'
os.makedirs(output_dir, exist_ok=True)
# Names for the 11 whitespace-separated fields of each data row.
column_names = ['P', 'HT', 'TEMP', 'DWPT', 'RH', 'Q', 'DRCT', 'WS', 'THTA', 'THTE', 'THTV']
# Every entry name in the folder (regular files and sub-directories alike).
file_list = os.listdir(folder_path)
# NOTE(review): nothing is ever appended to this list; it is kept only so the
# cell leaves the same global `data` binding as before.
data = []
# Convert each file to a CSV, one at a time.
for file in file_list:
    file_path = os.path.join(folder_path, file)
    # os.listdir also returns directories (output_dir itself lives inside
    # folder_path); read_csv cannot open those, so skip them.
    if not os.path.isfile(file_path):
        continue
    # Output name = up to 18 characters preceding the last 4 characters of the
    # file name (presumably the 4 characters are an extension such as ".txt"
    # -- TODO confirm against the real file names).
    output_filename = file[-22:-4] + '.csv'
    output_path = os.path.join(output_dir, output_filename)
    # Same layout as the single-file example: skip 75 leading and 20 trailing
    # lines. skipfooter is not supported by the C parser, hence engine='python'.
    df = pd.read_csv(
        file_path,
        sep=r'\s+',
        header=None,
        skiprows=75,
        skipfooter=20,
        names=column_names,
        engine='python',
    )
    print(df)
    # Replace missing values with the -999999 sentinel before writing.
    df.fillna(-999999, inplace=True)
    # Write into output_dir; the original wrote to the bare file name, which
    # landed in the current working directory and left output_path unused.
    df.to_csv(output_path)
完结撒花